MYSQL - Notes - Colab
MYSQL - Notes - Colab
MYSQL - Notes - Colab
Notes the below two code shows how to create database and upload it in colab
keyboard_arrow_down
[ ] ↳ 6 cells hidden
keyboard_arrow_down SQL_Queries
# import sqlite3
# def query_database(query):
# db_path = '/content/airbnb.db'
# conn = sqlite3.connect(db_path)
# cursor = conn.cursor()
# cursor.execute(query)
# results = cursor.fetchall()
# cursor.close()
# conn.close()
# return results
def query_database(query):
db_path = '/content/airbnb.db'
try:
conn = sqlite3.connect(db_path)
cursor = conn.cursor()
cursor.execute(query)
results = cursor.fetchall()
except sqlite3.Error as e:
print(f"An error occurred: {e}")
results = []
finally:
cursor.close()
conn.close()
return results
import sqlite3
import pandas as pd
def load_data_to_df(query):
db_path = '/content/airbnb.db'
conn = sqlite3.connect(db_path)
df = pd.read_sql_query(query, conn)
conn.close()
return df
Data in airbnb_listings:
id city country number_of_rooms year_listed
https://colab.research.google.com/drive/1P3yDvKI2XlFPwQ5y5bbUAVLqBkdAgCfM#scrollTo=gKITIhybsOHL&printMode=true 1/10
04/08/2024, 22:06 MYSQL_Notes - Colab
result = query_database("SELECT id,city FROM airbnb_listings ORDER BY number_of_rooms ASC ")
for listing in result:
print(listing)
(2, 'Tokyo')
(3, 'New York')
(1, 'Paris')
(1, 'Paris')
(2, 'Tokyo')
(3, 'New York')
('Paris',)
('Tokyo',)
('New York',)
keyboard_arrow_down Filtering_DATA
all_listings = query_database("SELECT * FROM airbnb_listings WHERE number_of_rooms >= 3;")
for listing in all_listings:
print(listing)
all_listings = query_database(" SELECT * FROM airbnb_listings WHERE number_of_rooms BETWEEN 3 AND 6")
for listing in all_listings:
print(listing)
('France', 5)
('Japan', 2)
('USA', 2)
('France', 5.0)
('Japan', 2.0)
('USA', 2.0)
('France', 5)
('Japan', 2)
('USA', 2)
https://colab.research.google.com/drive/1P3yDvKI2XlFPwQ5y5bbUAVLqBkdAgCfM#scrollTo=gKITIhybsOHL&printMode=true 2/10
04/08/2024, 22:06 MYSQL_Notes - Colab
('France', 5)
('Japan', 2)
('USA', 2)
all_listings = query_database("SELECT country, AVG(number_of_rooms) AS avg_rooms FROM airbnb_listings GROUP BY country ORDER
for listing in all_listings:
print(listing)
('Japan', 2.0)
('USA', 2.0)
('France', 5.0)
all_listings = query_database("SELECT country, MAX(number_of_rooms) FROM airbnb_listings WHERE country IN ('USA', 'Japan') G
for listing in all_listings:
print(listing)
('Japan', 2)
('USA', 2)
('France', 1)
('Japan', 1)
('USA', 1)
all_listings = query_database("SELECT year_listed FROM airbnb_listings GROUP BY year_listed HAVING COUNT (id) > 100;")
for listing in all_listings:
print(listing)
# 9. Get all the listings where the city starts with 'j' and where the city does not end in 't'
all_listings = query_database("SELECT * FROM airbnb_listings WHERE city LIKE 'j%' AND city NOT LIKE '%t';")
for listing in all_listings:
print(listing)
# 10. Get all the listings in 'Paris' where number_of_rooms is bigger than 3
all_listings = query_database("SELECT * FROM airbnb_listings WHERE city = 'Paris' AND number_of_rooms > 3;")
for listing in all_listings:
print(listing)
# 11. Get all the listings in 'Paris' OR the ones that were listed after 2012
all_listings = query_database("SELECT * FROM airbnb_listings WHERE city = 'Paris' OR year_listed > 2012;")
for listing in all_listings:
print(listing)
https://colab.research.google.com/drive/1P3yDvKI2XlFPwQ5y5bbUAVLqBkdAgCfM#scrollTo=gKITIhybsOHL&printMode=true 3/10
04/08/2024, 22:06 MYSQL_Notes - Colab
# 13. Return the listings where number_of_rooms is not missing
all_listings = query_database("SELECT * FROM airbnb_listings WHERE number_of_rooms IS NOT NULL;")
for listing in all_listings:
print(listing)
keyboard_arrow_down Aggregating_Data
# 1. Get the total number of rooms available across all listings
all_listings = query_database("SELECT SUM(number_of_rooms) FROM airbnb_listings;")
for listing in all_listings:
print(listing)
(9,)
# 2. Get the average number of rooms per listing across all listings
all_listings = query_database("SELECT AVG (number_of_rooms) FROM airbnb_listings;")
for listing in all_listings:
print(listing)
(3.0,)
# 3. Get the listing with the highest number of rooms across all listings
all_listings = query_database("SELECT MAX(number_of_rooms) FROM airbnb_listings;")
for listing in all_listings:
print(listing)
(5,)
# 4. Get the listing with the lowest number of rooms across all listings
all_listings = query_database("SELECT MIN(number_of_rooms) FROM airbnb_listings;")
for listing in all_listings:
print(listing)
(2,)
https://colab.research.google.com/drive/1P3yDvKI2XlFPwQ5y5bbUAVLqBkdAgCfM#scrollTo=gKITIhybsOHL&printMode=true 4/10
04/08/2024, 22:06 MYSQL_Notes - Colab
import sqlite3
cursor.execute('''
CREATE TABLE IF NOT EXISTS Album (
album_id INTEGER PRIMARY KEY,
title TEXT,
artist_id INTEGER,
FOREIGN KEY (artist_id) REFERENCES Artist (artist_id)
)
''')
import os
https://colab.research.google.com/drive/1P3yDvKI2XlFPwQ5y5bbUAVLqBkdAgCfM#scrollTo=gKITIhybsOHL&printMode=true 5/10
04/08/2024, 22:06 MYSQL_Notes - Colab
def query_database(query):
db_path = '/content/music.db'
try:
conn = sqlite3.connect(db_path)
cursor = conn.cursor()
cursor.execute(query)
results = cursor.fetchall()
except sqlite3.Error as e:
print(f"An error occurred: {e}")
results = []
finally:
cursor.close()
conn.close()
return results
import sqlite3
def query_database(query):
db_path = '/content/music.db'
conn = sqlite3.connect(db_path)
cursor = conn.cursor()
cursor.execute(query)
results = cursor.fetchall()
cursor.close()
conn.close()
return results
import sqlite3
import pandas as pd
def load_data_to_df(query):
db_path = '/content/music.db'
conn = sqlite3.connect(db_path)
df = pd.read_sql_query(query, conn)
conn.close()
return df
https://colab.research.google.com/drive/1P3yDvKI2XlFPwQ5y5bbUAVLqBkdAgCfM#scrollTo=gKITIhybsOHL&printMode=true 6/10
04/08/2024, 22:06 MYSQL_Notes - Colab
Data in Album:
album_id title artist_id
0 123 AC/DC
1 124 Aerosmith
toggle_off toggle_off
View View
Next Generate New interactive Generate New interactive
df recommended df recommended
steps: code with sheet code with sheet
l t l t
df = load_data_to_df(" SELECT * FROM artist AS art LEFT JOIN album AS alb ON art.artist_id = alb.artist_id;")
display(df)
Next steps: Generate code with df toggle_off View recommended plots New interactive sheet
df = load_data_to_df("SELECT * FROM artist AS art INNER JOIN album AS alb USING (artist_id);")
display(df)
# An inner join between two tables will return only records where a
# joining field, such as a key, finds a match in both tables.
df = load_data_to_df("SELECT * FROM artist AS art INNER JOIN album AS alb ON art.artist_id = alb.artist_id")
display(df)
https://colab.research.google.com/drive/1P3yDvKI2XlFPwQ5y5bbUAVLqBkdAgCfM#scrollTo=gKITIhybsOHL&printMode=true 7/10
04/08/2024, 22:06 MYSQL_Notes - Colab
# A left join keeps all of the original records in the left table
# and returns missing values for any columns from the right table
# where the joining field did not find a match.
df = load_data_to_df("SELECT * FROM artist AS art LEFT JOIN album AS alb ON art.artist_id = alb.artist_id;")
display(df)
df = load_data_to_df("SELECT alb1.artist_id, alb1.title AS albi_title, alb2.title AS alb2_title FROM album AS alb1 INNER JO
display(df)
# A right join keeps all of the original records in the right table
# and returns missing values for any columns from the left table
# where the joining field did not find a match. Right joins are far
# less common than left joins, because right joins can always be
# re-written as left joins.
df = load_data_to_df("SELECT * FROM artist as art RIGHT JOIN album AS alb ON art.artist_id = alb.artist_id;")
display(df)
# Note Right join is not supported by sqlite which we are currently using
# A full join combines a left join and right join. A full join
# will return all records from a table, irrespective of whether
# there is a match on the joining field in the other table,
# returning null values accordingly.
# FULL JOIN on one field
df = load_data_to_df(" SELECT * FROM artist as art FULL OUTER JOIN album AS alb ON art.artist_id = alb.artist_id; ")
display(df)
https://colab.research.google.com/drive/1P3yDvKI2XlFPwQ5y5bbUAVLqBkdAgCfM#scrollTo=gKITIhybsOHL&printMode=true 8/10
04/08/2024, 22:06 MYSQL_Notes - Colab
# The UNION operator is used to vertically combine the results of two
# SELECT statements. For UNION to work without errors, all SELECT
# statements must have the same number of columns and corresponding
# columns must have the same data type. UNION does not return duplicates.
df = load_data_to_df(" SELECT artist_id FROM artist UNION SELECT artist_id FROM album; ")
display(df)
artist_id
0 123
1 124
2 125
# The anti join chooses records in the first table where a condition is
# NOT met in the second table. It makes use of a WHERE clause to use
# exclude values from the second table.
df = load_data_to_df("SELECT * FROM album WHERE artist_id NOT IN (SELECT artist_id FROM artist);")
display(df)
# CROSS JOIN creates all possible combinations of two tables. CROSS JOIN
# does not require a field to join ON.
name title
1 AC/DC Dream On
4 AC/DC Rumours
6 Aerosmith Dream On
9 Aerosmith Rumours
# The UNION ALL operator works just like UNION, but it returns
# duplicate values. The same restrictions of UNION hold true for UNION ALL
df = load_data_to_df("SELECT artist_id FROM artist UNION ALL SELECT artist_id FROM album;")
display(df)
https://colab.research.google.com/drive/1P3yDvKI2XlFPwQ5y5bbUAVLqBkdAgCfM#scrollTo=gKITIhybsOHL&printMode=true 9/10
04/08/2024, 22:06 MYSQL_Notes - Colab
artist_id
0 123
1 124
toggle_off
2 125
Next steps:
3 Generate
123 code with df View recommended plots New interactive sheet
4 124
# The INTERSECT operator returns only identical rows from two tables.
5 125
https://colab.research.google.com/drive/1P3yDvKI2XlFPwQ5y5bbUAVLqBkdAgCfM#scrollTo=gKITIhybsOHL&printMode=true 10/10