import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


# Load "data/airbnb_price.csv" as a DataFrame called prices.

prices = pd.read_csv('data/prices.csv')

# Load "data/airbnb_room_type.xlsx" as a DataFrame called xls, and the first 
# sheet from xls as a DataFrame called room_types.

room_types = pd.read_excel('data/room_types.xlsx')

# Load "data/airbnb_last_review.tsv" as a DataFrame called reviews.

reviews = pd.read_csv('data/reviews.tsv', delimiter='\t')


prices.head(50)


print(prices.shape)
print(prices.info())

(25209, 3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25209 entries, 0 to 25208
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   listing_id   25209 non-null  int64 
 1   price        25209 non-null  object
 2   nbhood_full  25209 non-null  object
dtypes: int64(1), object(2)
memory usage: 591.0+ KB
None


prices[prices.duplicated()==True]


room_types.head()


print(room_types.shape)
print(room_types.info())

(25209, 3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25209 entries, 0 to 25208
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   listing_id   25209 non-null  int64 
 1   description  25199 non-null  object
 2   room_type    25209 non-null  object
dtypes: int64(1), object(2)
memory usage: 591.0+ KB
None


room_types[room_types.duplicated()==True]


reviews.head(50)


print(reviews.shape)  
print(reviews.info())

(25209, 3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25209 entries, 0 to 25208
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   listing_id   25209 non-null  int64 
 1   host_name    25201 non-null  object
 2   last_review  25209 non-null  object
dtypes: int64(1), object(2)
memory usage: 591.0+ KB
None


reviews[reviews.duplicated()==True]


# Cleaning the Price Column:
# using regex to extract all characters which aren't an int '(\d+)'
# converting the datatype of the column from object to float

prices['price'] = prices['price'].str.extract('(\d+)', 
                                              expand=False).astype(float)

prices.head()


prices.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25209 entries, 0 to 25208
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   listing_id   25209 non-null  int64  
 1   price        25209 non-null  float64
 2   nbhood_full  25209 non-null  object 
dtypes: float64(1), int64(1), object(1)
memory usage: 591.0+ KB


# Exclude values equal to 0
prices = prices[prices['price'] != 0]

plt.hist(prices['price'], 100);


# calculating avg_price
avg_price = prices['price'].mean()

print('$', round(avg_price, 2))

$ 141.82


# new column: monthly_price
prices['monthly_price'] = prices['price']*30

# calculate avergae monthly price 
avg_monthly_price = prices['monthly_price'].mean()

print('$', round(avg_monthly_price, 2))

$ 4254.52


# Using regex to and replacing several variations of room type with a standard description. 

room_types['room_type'] = room_types['room_type'].str.lower()

room_types['room_type'].value_counts()

entire home/apt    13266
private room       11356
shared room          587
Name: room_type, dtype: int64


# incorrect date format in reviews['last_review']

reviews['last_review'] = pd.to_datetime(reviews['last_review'])
reviews.head()


# format host name

# Add a space between a lowercase letter and an uppercase letter
reviews['host_name'] = reviews['host_name'].str.replace(r'([a-z])([A-Z])', r'\1 \2')

# Capitalize the first and second words
reviews['host_name'] = reviews['host_name'].str.title()

# Replace '&', 'AND', 'and', 'Or', 'OR', 'And', 'or' with '&'
reviews['host_name'] = reviews['host_name'].str.replace(r'(&| AND | and | Or | OR | And | or )', ' & ')

/var/folders/1m/hvnyjbcx2rxfb9d__rb_m7h00000gp/T/ipykernel_6152/1820784833.py:4: FutureWarning: The default value of regex will change from True to False in a future version.
  reviews['host_name'] = reviews['host_name'].str.replace(r'([a-z])([A-Z])', r'\1 \2')
/var/folders/1m/hvnyjbcx2rxfb9d__rb_m7h00000gp/T/ipykernel_6152/1820784833.py:10: FutureWarning: The default value of regex will change from True to False in a future version.
  reviews['host_name'] = reviews['host_name'].str.replace(r'(&| AND | and | Or | OR | And | or )', ' & ')


df = pd.merge(reviews, prices, on='listing_id')
df = pd.merge(df, room_types, on='listing_id')

df.head()


df['listing_id'] = df['listing_id'].astype('string')

df['description'] = df['description'].fillna('No description')
df['host_name'] = df['host_name'].fillna('Unknown.')

df.isnull().sum()

listing_id       0
host_name        0
last_review      0
price            0
nbhood_full      0
monthly_price    0
description      0
room_type        0
dtype: int64


df['borough'] = df['nbhood_full'].str.split(',').str.get(0)

df.head()


price_ranges = {
    (0, 69): '$0-69',
    (70, 175): '$70-175',
    (176, 350): '$176-350',
    (351, float('inf')): '>$350'
}

def categorize_price(price):
    for price_range, label in price_ranges.items():
        if price >= price_range[0] and price <= price_range[1]:
            return label

df['price_range'] = df['price'].map(categorize_price)
df['labels'] = df['price_range'].replace(price_ranges.values(), ['Budget', 'Average', 'Expensive', 'Extravagant'])



df.head()


# Group the data by 'borough' and 'price_range', and count the frequencies
grouped_df = df.groupby(['borough', 'price_range', 'labels']).size().reset_index(name='frequency')

grouped_df.head(50)


# saving as csv

df.to_csv('airbnb_rental_data.csv', index=False)

	listing_id	price	nbhood_full
0	2595	225 dollars	Manhattan, Midtown
1	3831	89 dollars	Brooklyn, Clinton Hill
2	5099	200 dollars	Manhattan, Murray Hill
3	5178	79 dollars	Manhattan, Hell's Kitchen
4	5238	150 dollars	Manhattan, Chinatown
5	5295	135 dollars	Manhattan, Upper West Side
6	5441	85 dollars	Manhattan, Hell's Kitchen
7	5803	89 dollars	Brooklyn, South Slope
8	6021	85 dollars	Manhattan, Upper West Side
9	6848	140 dollars	Brooklyn, Williamsburg
10	7097	215 dollars	Brooklyn, Fort Greene
11	7322	140 dollars	Manhattan, Chelsea
12	7726	99 dollars	Brooklyn, Crown Heights
13	8024	130 dollars	Brooklyn, Park Slope
14	8025	80 dollars	Brooklyn, Park Slope
15	8110	110 dollars	Brooklyn, Park Slope
16	8490	120 dollars	Brooklyn, Bedford-Stuyvesant
17	8505	60 dollars	Brooklyn, Windsor Terrace
18	9518	44 dollars	Manhattan, Inwood
19	9657	180 dollars	Manhattan, East Village
20	9668	50 dollars	Manhattan, Harlem
21	9704	52 dollars	Manhattan, Harlem
22	9782	55 dollars	Brooklyn, Greenpoint
23	9783	50 dollars	Manhattan, Harlem
24	10452	70 dollars	Brooklyn, Bedford-Stuyvesant
25	10962	89 dollars	Brooklyn, South Slope
26	11708	85 dollars	Brooklyn, Bushwick
27	12048	40 dollars	Manhattan, Lower East Side
28	12192	68 dollars	Manhattan, East Village
29	12299	120 dollars	Brooklyn, South Slope
30	12318	135 dollars	Manhattan, Upper West Side
31	12343	150 dollars	Manhattan, Harlem
32	12627	150 dollars	Brooklyn, Prospect-Lefferts Gardens
33	12937	130 dollars	Queens, Long Island City
34	12940	110 dollars	Brooklyn, Bedford-Stuyvesant
35	13394	80 dollars	Brooklyn, Fort Greene
36	13808	80 dollars	Brooklyn, Bedford-Stuyvesant
37	14287	151 dollars	Manhattan, Upper West Side
38	14290	228 dollars	Brooklyn, Williamsburg
39	14314	144 dollars	Brooklyn, Greenpoint
40	14322	200 dollars	Manhattan, Kips Bay
41	14377	150 dollars	Brooklyn, Williamsburg
42	14991	110 dollars	Manhattan, Lower East Side
43	15220	69 dollars	Manhattan, Hell's Kitchen
44	15338	49 dollars	Brooklyn, Greenpoint
45	15341	180 dollars	Manhattan, SoHo
46	15385	80 dollars	Brooklyn, Williamsburg
47	15711	250 dollars	Manhattan, Upper East Side
48	16326	200 dollars	Brooklyn, Prospect Heights
49	16421	52 dollars	Manhattan, Hell's Kitchen

	listing_id	description	room_type
0	2595	Skylit Midtown Castle	Entire home/apt
1	3831	Cozy Entire Floor of Brownstone	Entire home/apt
2	5099	Large Cozy 1 BR Apartment In Midtown East	Entire home/apt
3	5178	Large Furnished Room Near B'way	private room
4	5238	Cute & Cozy Lower East Side 1 bdrm	Entire home/apt

	listing_id	host_name	last_review
0	2595	Jennifer	May 21 2019
1	3831	LisaRoxanne	July 05 2019
2	5099	Chris	June 22 2019
3	5178	Shunichi	June 24 2019
4	5238	Ben	June 09 2019
5	5295	Lena	June 22 2019
6	5441	Kate	June 23 2019
7	5803	Laurie	June 24 2019
8	6021	Claudio	July 05 2019
9	6848	Allen & Irina	June 29 2019
10	7097	Jane	June 28 2019
11	7322	Doti	July 01 2019
12	7726	Adam And Charity	June 22 2019
13	8024	Lisel	July 01 2019
14	8025	Lisel	January 01 2019
15	8110	Lisel	July 02 2019
16	8490	Nathalie	June 19 2019
17	8505	Gregory	June 23 2019
18	9518	Shon	June 15 2019
19	9657	Dana	April 19 2019
20	9668	Ssameer Or Trip	June 01 2019
21	9704	Teri	June 14 2019
22	9782	Andrea	June 15 2019
23	9783	Ssameer Or Trip	July 01 2019
24	10452	Angela	May 12 2019
25	10962	Laurie	June 21 2019
26	11708	Tyrome	June 22 2019
27	12048	Ben	July 05 2019
28	12192	Edward	June 21 2019
29	12299	Abdul	May 27 2019
30	12318	Cyn	June 16 2019
31	12343	Earl	June 13 2019
32	12627	Rana	June 05 2019
33	12937	Orestes	July 01 2019
34	12940	Adreinne	May 25 2019
35	13394	Alexander	June 17 2019
36	13808	JT And Tiziana	June 16 2019
37	14287	Joya	July 07 2019
38	14290	James	May 17 2019
39	14314	Jeanne	June 29 2019
40	14322	Francesca	March 25 2019
41	14377	Joanna	June 22 2019
42	14991	Bianca	May 18 2019
43	15220	Luiz	June 09 2019
44	15338	Andrea	June 04 2019
45	15341	Ted	June 29 2019
46	15385	Cristina	June 30 2019
47	15711	D	March 30 2019
48	16326	Dimitri	January 26 2019
49	16421	Mark	May 16 2019

	listing_id	price	nbhood_full
0	2595	225.0	Manhattan, Midtown
1	3831	89.0	Brooklyn, Clinton Hill
2	5099	200.0	Manhattan, Murray Hill
3	5178	79.0	Manhattan, Hell's Kitchen
4	5238	150.0	Manhattan, Chinatown

	listing_id	host_name	last_review
0	2595	Jennifer	2019-05-21
1	3831	LisaRoxanne	2019-07-05
2	5099	Chris	2019-06-22
3	5178	Shunichi	2019-06-24
4	5238	Ben	2019-06-09

Pillow Palooza¶

Data Wrangling and Cleaning¶

Importing the Data¶

Initial Exploration¶

Initial Observations¶

Prices dataframe¶

Price column¶

Avg Price column¶

Removing outliers¶

Avg Price per Month column¶

Room types dataframe¶

Room Type column¶

Reviews dataframe¶

Last review column¶

Host Name Column¶

Joining the dataframes¶

Analyzing listing prices by NYC borough¶

Price range by borough¶

Exporting the clean datasets¶

label	price
budget	\$0-69
average	\$70-175
expensive	\$176-350
extravagant	>\$350

	borough	price_range	labels	frequency
0	Bronx	$0-69	Budget	381
1	Bronx	$176-350	Expensive	25
2	Bronx	$70-175	Average	286
3	Bronx	>$350	Extravagant	5
4	Brooklyn	$0-69	Budget	3201
5	Brooklyn	$176-350	Expensive	1466
6	Brooklyn	$70-175	Average	5534
7	Brooklyn	>$350	Extravagant	259
8	Manhattan	$0-69	Budget	1150
9	Manhattan	$176-350	Expensive	3073
10	Manhattan	$70-175	Average	5289
11	Manhattan	>$350	Extravagant	810
12	Queens	$0-69	Budget	1632
13	Queens	$176-350	Expensive	291
14	Queens	$70-175	Average	1505
15	Queens	>$350	Extravagant	28
16	Staten Island	$0-69	Budget	124
17	Staten Island	$176-350	Expensive	20
18	Staten Island	$70-175	Average	123