# complete number of rowsprint(hotel_bookings.index)
# RangeIndex(start=0, stop=119390, step=1)# complete number of columnsprint(len(hotel_bookings.columns))
# 32
Missing Data
# only show rows that have missing valueshotel_bookings_nan=hotel_bookings[hotel_bookings.isna().any(axis=1)]
hotel_bookings_nan# 119173 rows × 32 columns# only 119390 - 119173 = 217 rows don't have missing entries
hotel
is_canceled
lead_time
arrival_date_year
arrival_date_month
arrival_date_week_number
arrival_date_day_of_month
stays_in_weekend_nights
stays_in_week_nights
adults
...
deposit_type
agent
company
days_in_waiting_list
customer_type
adr
required_car_parking_spaces
total_of_special_requests
reservation_status
reservation_status_date
0
Resort Hotel
0
342
2015
July
27
1
0
0
2
...
No Deposit
NaN
NaN
0
Transient
0.00
0
0
Check-Out
01-07-15
1
Resort Hotel
0
737
2015
July
27
1
0
0
2
...
No Deposit
NaN
NaN
0
Transient
0.00
0
0
Check-Out
01-07-15
2
Resort Hotel
0
7
2015
July
27
1
0
1
1
...
No Deposit
NaN
NaN
0
Transient
75.00
0
0
Check-Out
02-07-15
3
Resort Hotel
0
13
2015
July
27
1
0
1
1
...
No Deposit
304.0
NaN
0
Transient
75.00
0
0
Check-Out
02-07-15
4
Resort Hotel
0
14
2015
July
27
1
0
2
2
...
No Deposit
240.0
NaN
0
Transient
98.00
0
1
Check-Out
03-07-15
...
119385
City Hotel
0
23
2017
August
35
30
2
5
2
...
No Deposit
394.0
NaN
0
Transient
96.14
0
0
Check-Out
06-09-17
119386
City Hotel
0
102
2017
August
35
31
2
5
3
...
No Deposit
9.0
NaN
0
Transient
225.43
0
2
Check-Out
07-09-17
119387
City Hotel
0
34
2017
August
35
31
2
5
2
...
No Deposit
9.0
NaN
0
Transient
157.71
0
4
Check-Out
07-09-17
119388
City Hotel
0
109
2017
August
35
31
2
5
2
...
No Deposit
89.0
NaN
0
Transient
104.40
0
0
Check-Out
07-09-17
119389
City Hotel
0
205
2017
August
35
29
2
7
2
...
No Deposit
9.0
NaN
0
Transient
151.20
0
2
Check-Out
07-09-17
# which columns have the most missing entrieshotel_bookings.isna().sum()
# the columns company, agent and country have the most missing data:
hotel
0
is_canceled
0
lead_time
0
arrival_date_year
0
arrival_date_month
0
arrival_date_week_number
0
arrival_date_day_of_month
0
stays_in_weekend_nights
0
stays_in_week_nights
0
adults
0
children
4
babies
0
meal
0
country
488
market_segment
0
distribution_channel
0
is_repeated_guest
0
previous_cancellations
0
previous_bookings_not_canceled
0
reserved_room_type
0
assigned_room_type
0
booking_changes
0
deposit_type
0
agent
16340
company
112593
days_in_waiting_list
0
customer_type
0
adr
0
required_car_parking_spaces
0
total_of_special_requests
0
reservation_status
0
reservation_status_date
0
dtype: int64
# drop columns with missing datahotel_bookings_dropped_nan=hotel_bookings.drop(['company', 'agent'], axis=1)
hotel_bookings_dropped_nan.head(2)
hotel_bookings_dropped_nan[hotel_bookings_dropped_nan.isna().any(axis=1)]
# 4 rows × 29 columns# only the 4 rows with missing data in the children column and 488 country column remain
Exploration
Data Columns
# what columns do we havepd.Series(hotel_bookings.columns)
0
hotel
1
is_canceled
2
lead_time
3
arrival_date_year
4
arrival_date_month
5
arrival_date_week_number
6
arrival_date_day_of_month
7
stays_in_weekend_nights
8
stays_in_week_nights
9
adults
10
children
11
babies
12
meal
13
country
14
market_segment
15
distribution_channel
16
is_repeated_guest
17
previous_cancellations
18
previous_bookings_not_canceled
19
reserved_room_type
20
assigned_room_type
21
booking_changes
22
deposit_type
23
agent
24
company
25
days_in_waiting_list
26
customer_type
27
adr
28
required_car_parking_spaces
29
total_of_special_requests
30
reservation_status
31
reservation_status_date
dtype: object
Top Countries
# top5 country codeshotel_bookings_dropped_nan['country'].value_counts().head(5)
plot=hotel_bookings_dropped_nan.plot.scatter(
figsize=(12,8),
x='adr',
y='hotel')
# there are only 2 hotels and all adr's are within 0-500$ with one outlier above 5000$
plot=hotel_bookings_dropped_nan.plot.hist(
column=["adr"],
by="hotel",
bins=100,
figsize=(10, 8)
)
# the outlier squeezes the first histogram and makes it hard to compare them
# let's find the outlier iloc and drop the rowhotel_bookings_dropped_nan['adr'].idxmax()
# 48515
# calculate the average daily rate `adr` for a guest staying at each hoteladr_by_hotel=hotel_bookings_dropped_nan.groupby('hotel').mean(numeric_only=True)['adr']
adr_by_hotel
Average Daily Rate
hotel
City Hotel
105.304465
Resort Hotel
94.952930
Name: adr, dtype: float64
Average Stays
# how long do guest stay on averagehotel_bookings_dropped_nan['total_days'] =hotel_bookings_dropped_nan['stays_in_weekend_nights'] +hotel_bookings_dropped_nan['stays_in_week_nights']
hotel_bookings_dropped_nan['total_days'].head(5)
0
0
1
0
2
1
3
1
4
2
Name: total_days, dtype: int64
average_stays=hotel_bookings_dropped_nan.groupby('hotel').mean(numeric_only=True).round(1)['total_days']
average_stays# the average staying time is 3 and 4.3 days, respectively
hotel
City Hotel
3.0
Resort Hotel
4.3
_Name: total
days, dtype: float64
Average Cost per Stay
# given the # of days and average daily adr we can calculate the average total cost per stayhotel_bookings_dropped_nan['total_cost'] =hotel_bookings_dropped_nan['total_days'] *hotel_bookings_dropped_nan['adr']
hotel_bookings_dropped_nan['total_cost'].head(5)
# total number of bookings per hotelhotel_bookings_dropped_nan.value_counts('hotel')
hotel
City Hotel
79330
Resort Hotel
40060
dtype: int64
# select only city hotelcity_hotel_bookings=hotel_bookings_dropped_nan[hotel_bookings_dropped_nan['hotel'] =='City Hotel']
city_hotel_bookings['hotel'].head(5)
40060
City Hotel
40061
City Hotel
40062
City Hotel
40063
City Hotel
40064
City Hotel
Name: hotel, dtype: object
# select only resort hotelresort_hotel_bookings=hotel_bookings_dropped_nan[hotel_bookings_dropped_nan['hotel'] =='Resort Hotel']
resort_hotel_bookings['hotel'].head(5)