This notebook is a part of a talk i have been giving to introduce people to SQL. Only pre-requisite to attend this talk is some general curiosity.
This dataset was generated from The Movie Database API. This product uses the TMDb API but is not endorsed or certified by TMDb. We used the dataset published on kaggle that includes 5000 movies
import os
import sqlite3
db_path = os.path.expanduser('~/temp/intro_to_sql/moviesT500_v2.db')
conn = sqlite3.connect(db_path)
c = conn.cursor()
import pandas as pd
import plotly.express as px
from IPython.display import display
def display_all(df):
with pd.option_context("display.max_rows", 1000, "display.max_columns", 300,'display.max_colwidth', -1):
display(df)
# 100% width
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
sql_query = """
select * from tmdb_5000_movies
limit 5
"""
pd.read_sql_query(sql_query, conn)
sql_query = """
select min(release_date), max(release_date) from tmdb_5000_movies
"""
pd.read_sql_query(sql_query, conn)
first movie was released in 1916 and the last one was released in 2017
sql_query = """
select
date(release_date, 'start of year') as year,
count(*) as movies
from tmdb_5000_movies
group by
date(release_date, 'start of year')
"""
df = pd.read_sql_query(sql_query, conn)
let's plot this data onto a histogram/bar chart
fig = px.bar(
df,
x='year',
y='movies',
)
fig.show()
sql_query = '''
SELECT * from tmdb_5000_movies
WHERE
keywords LIKE '%spider%'
limit 5
'''
df = pd.read_sql_query(sql_query, conn)
df