For the purpose of keeping the SQL portion of this project available in our notebooks, we'll create an SQLite connection.
Our team at Pillow Palooza has exported the clean and preprocessed data and created a schema for us to use in our analysis. We have also included extra columns (from public data sources) to enhance the analysis.
After meeting with your team, you wrote some questions to guide you along the analysis:
import pandas as pd
import sqlite3
# create the SQLite connection
cnn = sqlite3.connect('palooza.db')
# import the enriched data
prices = pd.read_csv('data_enriched/query_prices.csv')
room_types = pd.read_csv('data_enriched/query_room_types.csv')
reviews = pd.read_csv('data_enriched/query_reviews.csv')
# Save the data to SQLite
prices.to_sql('prices', cnn, if_exists='replace')
room_types.to_sql('room_types', cnn, if_exists='replace')
reviews.to_sql('reviews', cnn, if_exists='replace')
25209
# export for tableau
df = pd.merge(reviews, prices, on='listing_id')
df = pd.merge(df, room_types, on='listing_id')
df.to_csv('enriched_airbnb_rental_data.csv')
df.shape
(25202, 20)
# Load the ipython-sql extension
%load_ext sql
# Connect to the SQLite database
%sql sqlite:///palooza.db
'Connected: @palooza.db'
%%sql
SELECT COUNT(*)
FROM prices;
* sqlite:///palooza.db Done.
| COUNT(*) | 
|---|
| 25202 | 
%%sql
SELECT COUNT(*)
FROM room_types;
* sqlite:///palooza.db Done.
| COUNT(*) | 
|---|
| 25209 | 
%%sql
SELECT COUNT(*)
FROM reviews;
* sqlite:///palooza.db Done.
| COUNT(*) | 
|---|
| 25209 | 
%%sql
SELECT room_type, COUNT(*) type_count
FROM room_types
GROUP BY room_type
ORDER BY 2 DESC
* sqlite:///palooza.db Done.
| room_type | type_count | 
|---|---|
| entire home/apt | 13266 | 
| private room | 11356 | 
| shared room | 587 | 
%%sql
SELECT room_type, ROUND(AVG(price),2) average_price
FROM prices p
JOIN room_types rt 
ON p.listing_id = rt.listing_id
GROUP BY 1
ORDER BY 2 DESC
* sqlite:///palooza.db Done.
| room_type | average_price | 
|---|---|
| entire home/apt | 197.17 | 
| private room | 81.67 | 
| shared room | 53.65 | 
%%sql
SELECT AVG(price * booked_days_365) average_price
FROM prices p
JOIN room_types rt 
ON p.listing_id = rt.listing_id
JOIN reviews r 
ON r.listing_id = rt.listing_id
WHERE room_type = 'entire home/apt'
* sqlite:///palooza.db Done.
| average_price | 
|---|
| 40991.32526760139 | 
%%sql
SELECT borough, ROUND(AVG(price_per_month),2) avg_price_per_month
FROM prices
GROUP BY 1
ORDER BY 2 DESC
* sqlite:///palooza.db Done.
| borough | avg_price_per_month | 
|---|---|
| Manhattan | 5596.69 | 
| Brooklyn | 3710.06 | 
| Queens | 2823.09 | 
| Staten Island | 2617.2 | 
| Bronx | 2410.25 | 
%%sql
SELECT borough, room_type, COUNT(room_type) AS room_count
FROM prices p
JOIN room_types rt 
ON p.listing_id = rt.listing_id
GROUP BY 1, 2
ORDER BY 1 ASC, 3 DESC
* sqlite:///palooza.db Done.
| borough | room_type | room_count | 
|---|---|---|
| Bronx | private room | 403 | 
| Bronx | entire home/apt | 261 | 
| Bronx | shared room | 33 | 
| Brooklyn | entire home/apt | 5367 | 
| Brooklyn | private room | 4906 | 
| Brooklyn | shared room | 187 | 
| Manhattan | entire home/apt | 6170 | 
| Manhattan | private room | 3901 | 
| Manhattan | shared room | 251 | 
| Queens | private room | 2009 | 
| Queens | entire home/apt | 1335 | 
| Queens | shared room | 112 | 
| Staten Island | entire home/apt | 133 | 
| Staten Island | private room | 132 | 
| Staten Island | shared room | 2 | 
%%sql
SELECT room_type, COUNT(price) num_listings
FROM prices p
JOIN room_types rt 
ON p.listing_id = rt.listing_id
WHERE price > 500
GROUP BY 1
* sqlite:///palooza.db Done.
| room_type | num_listings | 
|---|---|
| entire home/apt | 395 | 
| private room | 19 | 
| shared room | 1 | 
%%sql
SELECT borough, 
        MIN(price) AS min_price, 
        MAX(price) AS max_price,
        ROUND(AVG(price),2) AS mean_price
FROM prices
GROUP BY 1
ORDER BY 1
* sqlite:///palooza.db Done.
| borough | min_price | max_price | mean_price | 
|---|---|---|---|
| Bronx | 20 | 670 | 79.24 | 
| Brooklyn | 10 | 7500 | 121.97 | 
| Manhattan | 10 | 5100 | 184.0 | 
| Queens | 10 | 2600 | 92.81 | 
| Staten Island | 13 | 300 | 86.04 | 
%%sql
SELECT *
FROM reviews 
limit 10
* sqlite:///palooza.db Done.
| index | Unnamed: 0 | listing_id | host_name | last_review | minimum_nights | number_of_reviews | reviews_per_month | calculated_host_listings_count | availability_365 | booked_days_365 | 
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 2595 | Jennifer | 2019-05-21 00:00:00 | 1 | 45 | 0.38 | 2 | 355 | 10 | 
| 1 | 1 | 3831 | LisaRoxanne | 2019-07-05 00:00:00 | 1 | 270 | 4.64 | 1 | 194 | 171 | 
| 2 | 2 | 5099 | Chris | 2019-06-22 00:00:00 | 3 | 74 | 0.59 | 1 | 129 | 236 | 
| 3 | 3 | 5178 | Shunichi | 2019-06-24 00:00:00 | 2 | 430 | 3.47 | 1 | 220 | 145 | 
| 4 | 4 | 5238 | Ben | 2019-06-09 00:00:00 | 1 | 160 | 1.33 | 4 | 188 | 177 | 
| 5 | 5 | 5295 | Lena | 2019-06-22 00:00:00 | 5 | 53 | 0.43 | 1 | 6 | 359 | 
| 6 | 6 | 5441 | Kate | 2019-06-23 00:00:00 | 2 | 188 | 1.5 | 1 | 39 | 326 | 
| 7 | 7 | 5803 | Laurie | 2019-06-24 00:00:00 | 4 | 167 | 1.34 | 3 | 314 | 51 | 
| 8 | 8 | 6021 | Claudio | 2019-07-05 00:00:00 | 2 | 113 | 0.91 | 1 | 333 | 32 | 
| 9 | 9 | 6848 | Allen & Irina | 2019-06-29 00:00:00 | 2 | 148 | 1.2 | 1 | 46 | 319 | 
%%sql 
SELECT p.borough, SUM(p.price*r.booked_days_365) AS revenue_per_host
FROM prices p
JOIN reviews r
ON p.listing_id = r.listing_id
GROUP BY 1
ORDER BY 2 DESC
* sqlite:///palooza.db Done.
| borough | revenue_per_host | 
|---|---|
| Manhattan | 393420567 | 
| Brooklyn | 279130240 | 
| Queens | 58404083 | 
| Bronx | 9324180 | 
| Staten Island | 3443919 | 
%%sql 
SELECT neighbourhood, ROUND(AVG(price_per_month),2) AS avg_price
FROM prices
GROUP BY neighbourhood
ORDER BY 2 DESC
LIMIT 5
* sqlite:///palooza.db Done.
| neighbourhood | avg_price | 
|---|---|
| Sea Gate | 24485.42 | 
| Tribeca | 12066.44 | 
| Flatiron District | 10404.19 | 
| NoHo | 10190.33 | 
| SoHo | 9099.52 | 
%%sql
SELECT *
FROM reviews
LIMIT 5
* sqlite:///palooza.db Done.
| index | Unnamed: 0 | listing_id | host_name | last_review | minimum_nights | number_of_reviews | reviews_per_month | calculated_host_listings_count | availability_365 | booked_days_365 | 
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 2595 | Jennifer | 2019-05-21 00:00:00 | 1 | 45 | 0.38 | 2 | 355 | 10 | 
| 1 | 1 | 3831 | LisaRoxanne | 2019-07-05 00:00:00 | 1 | 270 | 4.64 | 1 | 194 | 171 | 
| 2 | 2 | 5099 | Chris | 2019-06-22 00:00:00 | 3 | 74 | 0.59 | 1 | 129 | 236 | 
| 3 | 3 | 5178 | Shunichi | 2019-06-24 00:00:00 | 2 | 430 | 3.47 | 1 | 220 | 145 | 
| 4 | 4 | 5238 | Ben | 2019-06-09 00:00:00 | 1 | 160 | 1.33 | 4 | 188 | 177 | 
%%sql 
SELECT COUNT(r.number_of_reviews) num_listings_no_review
FROM prices p
LEFT JOIN reviews r 
ON p.listing_id = r.listing_id
WHERE r.number_of_reviews IS NULL 
ORDER BY 1 ASC
* sqlite:///palooza.db Done.
| num_listings_no_review | 
|---|
| 0 | 
%%sql 
SELECT
  (COUNT(p.price * r.booked_days_365) - COUNT(p.price) * AVG(p.price) * AVG(r.booked_days_365)) /
  (COUNT(p.price) * AVG(p.price * p.price) - COUNT(p.price) * AVG(p.price) * AVG(p.price)) AS correlation
FROM prices p
JOIN reviews r
ON p.listing_id = r.listing_id
* sqlite:///palooza.db Done.
| correlation | 
|---|
| -1.4278840527031131 | 
%%sql
SELECT rt.room_type, ROUND(AVG(p.price),2) avg_price
FROM prices p 
JOIN reviews r
ON p.listing_id = r.listing_id
JOIN room_types rt
ON rt.listing_id = r.listing_id
WHERE r.availability_365 > 200 AND r.number_of_reviews >= 100 
GROUP BY 1
* sqlite:///palooza.db Done.
| room_type | avg_price | 
|---|---|
| entire home/apt | 179.54 | 
| private room | 85.41 | 
| shared room | 59.19 | 
%%sql
WITH listings AS (SELECT host_name, COUNT(listing_id) num_listings
FROM reviews
GROUP BY host_name
HAVING COUNT(listing_id) > 1
)
SELECT COUNT(host_name) num_hosts, MAX(num_listings) max_listings_per_host
FROM listings
* sqlite:///palooza.db Done.
| num_hosts | max_listings_per_host | 
|---|---|
| 2929 | 215 | 
# Here we aggregated the price_per_month as so that we could see the individual 
# hosts and as to generate the top 5 distinct hosts  
%%sql
SELECT r.host_name, ROUND(AVG(p.price_per_month),2) avg_monthly_price
FROM reviews r
JOIN prices p 
ON r.listing_id = p.listing_id
WHERE calculated_host_listings_count > 9
GROUP BY 1
ORDER BY 2 DESC
LIMIT 5
* sqlite:///palooza.db Done.
| host_name | avg_monthly_price | 
|---|---|
| Sally | 109895.42 | 
| Red Awning | 21452.58 | 
| Park Lane | 12318.75 | 
| Janet | 11334.01 | 
| Yotel | 8749.86 | 
%%sql
WITH avg_prices AS (
    SELECT AVG(price) AS avg_price
    FROM prices
)
SELECT neighbourhood, AVG((price - avg_price) * (price - avg_price)) AS variance
FROM prices, avg_prices
GROUP BY 1
ORDER BY 2 DESC
LIMIT 5
* sqlite:///palooza.db Done.
| neighbourhood | variance | 
|---|---|
| Sea Gate | 902211.2722433434 | 
| Bayside | 224007.989366134 | 
| East Flatbush | 173838.4025865787 | 
| Tribeca | 161899.30747083636 | 
| SoHo | 146980.2407632523 | 
%%sql
SELECT p.neighbourhood, ROUND(AVG(p.price_per_month),2) avg_price_monthly
FROM prices p
JOIN reviews r 
ON p.listing_id = r.listing_id
WHERE r.minimum_nights > (SELECT AVG(minimum_nights) FROM reviews)
GROUP BY 1
ORDER BY 2 DESC 
LIMIT 5
* sqlite:///palooza.db Done.
| neighbourhood | avg_price_monthly | 
|---|---|
| Bayside | 11949.95 | 
| Tribeca | 11791.53 | 
| Flatiron District | 10172.69 | 
| SoHo | 9549.23 | 
| NoHo | 9374.04 |