Hotel Booking Demand Dataset

Hotel Booking Demand Dataset
- Dataset

see Hotel booking demand datasets, Kaggle

import numpy as np
import pandas as pd
import datetime

Dataset

hotel_bookings = pd.read_csv('datasets/hotel_bookings.csv')
hotel_bookings.head(5)

Size

# complete number of rows
print(hotel_bookings.index)
# RangeIndex(start=0, stop=119390, step=1)

# complete number of columns
print(len(hotel_bookings.columns))
# 32

Missing Data

# only show rows that have missing values
hotel_bookings_nan = hotel_bookings[hotel_bookings.isna().any(axis=1)]
hotel_bookings_nan
# 119173 rows × 32 columns
# only 119390 - 119173 =  217 rows don't have missing entries

	hotel	is_canceled	lead_time	arrival_date_year	arrival_date_month	arrival_date_week_number	arrival_date_day_of_month	stays_in_weekend_nights	stays_in_week_nights	adults	...	deposit_type	agent	company	days_in_waiting_list	customer_type	adr	required_car_parking_spaces	total_of_special_requests	reservation_status	reservation_status_date
0	Resort Hotel	0	342	2015	July	27	1	0	0	2	...	No Deposit	NaN	NaN	0	Transient	0.00	0	0	Check-Out	01-07-15
1	Resort Hotel	0	737	2015	July	27	1	0	0	2	...	No Deposit	NaN	NaN	0	Transient	0.00	0	0	Check-Out	01-07-15
2	Resort Hotel	0	7	2015	July	27	1	0	1	1	...	No Deposit	NaN	NaN	0	Transient	75.00	0	0	Check-Out	02-07-15
3	Resort Hotel	0	13	2015	July	27	1	0	1	1	...	No Deposit	304.0	NaN	0	Transient	75.00	0	0	Check-Out	02-07-15
4	Resort Hotel	0	14	2015	July	27	1	0	2	2	...	No Deposit	240.0	NaN	0	Transient	98.00	0	1	Check-Out	03-07-15
...
119385	City Hotel	0	23	2017	August	35	30	2	5	2	...	No Deposit	394.0	NaN	0	Transient	96.14	0	0	Check-Out	06-09-17
119386	City Hotel	0	102	2017	August	35	31	2	5	3	...	No Deposit	9.0	NaN	0	Transient	225.43	0	2	Check-Out	07-09-17
119387	City Hotel	0	34	2017	August	35	31	2	5	2	...	No Deposit	9.0	NaN	0	Transient	157.71	0	4	Check-Out	07-09-17
119388	City Hotel	0	109	2017	August	35	31	2	5	2	...	No Deposit	89.0	NaN	0	Transient	104.40	0	0	Check-Out	07-09-17
119389	City Hotel	0	205	2017	August	35	29	2	7	2	...	No Deposit	9.0	NaN	0	Transient	151.20	0	2	Check-Out	07-09-17

# which columns have the most missing entries
hotel_bookings.isna().sum()

# the columns company, agent and country have the most missing data:


hotel	0
is_canceled	0
lead_time	0
arrival_date_year	0
arrival_date_month	0
arrival_date_week_number	0
arrival_date_day_of_month	0
stays_in_weekend_nights	0
stays_in_week_nights	0
adults	0
children	4
babies	0
meal	0
country	488
market_segment	0
distribution_channel	0
is_repeated_guest	0
previous_cancellations	0
previous_bookings_not_canceled	0
reserved_room_type	0
assigned_room_type	0
booking_changes	0
deposit_type	0
agent	16340
company	112593
days_in_waiting_list	0
customer_type	0
adr	0
required_car_parking_spaces	0
total_of_special_requests	0
reservation_status	0
reservation_status_date	0
dtype: int64

# drop columns with missing data
hotel_bookings_dropped_nan = hotel_bookings.drop(['company', 'agent'], axis=1)
hotel_bookings_dropped_nan.head(2)

hotel_bookings_dropped_nan[hotel_bookings_dropped_nan.isna().any(axis=1)]
# 4 rows × 29 columns
# only the 4 rows with missing data in the children column and 488 country column remain

Exploration

Data Columns

# what columns do we have
pd.Series(hotel_bookings.columns)


0	hotel
1	is_canceled
2	lead_time
3	arrival_date_year
4	arrival_date_month
5	arrival_date_week_number
6	arrival_date_day_of_month
7	stays_in_weekend_nights
8	stays_in_week_nights
9	adults
10	children
11	babies
12	meal
13	country
14	market_segment
15	distribution_channel
16	is_repeated_guest
17	previous_cancellations
18	previous_bookings_not_canceled
19	reserved_room_type
20	assigned_room_type
21	booking_changes
22	deposit_type
23	agent
24	company
25	days_in_waiting_list
26	customer_type
27	adr
28	required_car_parking_spaces
29	total_of_special_requests
30	reservation_status
31	reservation_status_date
dtype: object

Top Countries

# top5 country codes
hotel_bookings_dropped_nan['country'].value_counts().head(5)


PRT	48590
GBR	12129
FRA	10415
ESP	8568
DEU	7287
Name: country, dtype: int64

hotel_bookings_dropped_nan['country'].value_counts().head(15).plot.bar(figsize=(12,4),rot=65)

Average Daily Rates

plot = hotel_bookings_dropped_nan.plot.scatter(
    figsize=(12,8),
    x='adr',
    y='hotel')

# there are only 2 hotels and all adr's are within 0-500$ with one outlier above 5000$

# find outlier
hotel_bookings_dropped_nan.sort_values('adr', ascending=False).iloc[0]


hotel	City Hotel
is_canceled	1
lead_time	35
arrival_date_year	2016
arrival_date_month	March
arrival_date_week_number	13
arrival_date_day_of_month	25
stays_in_weekend_nights	0
stays_in_week_nights	1
adults	2
children	0.0
babies	0
meal	BB
country	PRT
market_segment	Offline TA/TO
distribution_channel	TA/TO
is_repeated_guest	0
previous_cancellations	0
previous_bookings_not_canceled	0
reserved_room_type	A
assigned_room_type	A
booking_changes	1
deposit_type	Non Refund
days_in_waiting_list	0
customer_type	Transient
adr	5400.0
required_car_parking_spaces	0
total_of_special_requests	0
reservation_status	Canceled
reservation_status_date	19-02-16
Name: 48515, dtype: object

plot = hotel_bookings_dropped_nan.plot.hist(
    column=["adr"],
    by="hotel",
    bins=100,
    figsize=(10, 8)
)

# the outlier squeezes the first histogram and makes it hard to compare them

# let's find the outlier iloc and drop the row
hotel_bookings_dropped_nan['adr'].idxmax()
# 48515

hotel_bookings_dropped_outlier = hotel_bookings_dropped_nan.drop(48515, axis=0)

plot = hotel_bookings_dropped_outlier.plot.hist(
    column=["adr"],
    by="hotel",
    bins=100,
    figsize=(10, 8)
)

# nice :)

# calculate the average daily rate `adr` for a guest staying at each hotel
adr_by_hotel = hotel_bookings_dropped_nan.groupby('hotel').mean(numeric_only=True)['adr']
adr_by_hotel

Average Daily Rate


hotel
City Hotel	105.304465
Resort Hotel	94.952930
Name: adr, dtype: float64

Average Stays

# how long do guest stay on average
hotel_bookings_dropped_nan['total_days'] = hotel_bookings_dropped_nan['stays_in_weekend_nights'] + hotel_bookings_dropped_nan['stays_in_week_nights']
hotel_bookings_dropped_nan['total_days'].head(5)


0	0
1	0
2	1
3	1
4	2
Name: total_days, dtype: int64

average_stays = hotel_bookings_dropped_nan.groupby('hotel').mean(numeric_only=True).round(1)['total_days']
average_stays

# the average staying time is 3 and 4.3 days, respectively

hotel
City Hotel	3.0
Resort Hotel	4.3
_Name: total	days, dtype: float64

Average Cost per Stay

# given the # of days and average daily adr we can calculate the average total cost per stay
hotel_bookings_dropped_nan['total_cost'] = hotel_bookings_dropped_nan['total_days'] * hotel_bookings_dropped_nan['adr']
hotel_bookings_dropped_nan['total_cost'].head(5)


0	0.0
1	0.0
2	75.0
3	75.0
4	196.0
Name: total_days, dtype: float64

average_total_cost = hotel_bookings_dropped_nan.groupby('hotel').mean(numeric_only=True).round(2)['total_cost']
average_total_cost

hotel
City Hotel	318.66
Resort Hotel	435.45
Name: total_days, dtype: float64

Percentage of Returning Guest

# total number of bookings per hotel
hotel_bookings_dropped_nan.value_counts('hotel')

hotel
City Hotel	79330
Resort Hotel	40060
dtype: int64

# select only city hotel
city_hotel_bookings = hotel_bookings_dropped_nan[hotel_bookings_dropped_nan['hotel'] == 'City Hotel']
city_hotel_bookings['hotel'].head(5)


40060	City Hotel
40061	City Hotel
40062	City Hotel
40063	City Hotel
40064	City Hotel
Name: hotel, dtype: object

# select only resort hotel
resort_hotel_bookings = hotel_bookings_dropped_nan[hotel_bookings_dropped_nan['hotel'] == 'Resort Hotel']
resort_hotel_bookings['hotel'].head(5)


0	Resort Hotel
1	Resort Hotel
2	Resort Hotel
3	Resort Hotel
4	Resort Hotel
Name: hotel, dtype: object

returning_customer_city_hotel = sum(city_hotel_bookings['is_repeated_guest'] == 1)
returning_customer_city_hotel
# 2032

total_customer_city_hotel = hotel_bookings_dropped_nan.value_counts('hotel')['City Hotel']
total_customer_city_hotel
# 79330

percentage_returning_customer_city_hotel = (
    returning_customer_city_hotel * 100 / total_customer_city_hotel
)
percentage_returning_customer_city_hotel.round(2)
# 2.56%

returning_customer_resort_hotel = sum(resort_hotel_bookings['is_repeated_guest'] == 1)
total_customer_resort_hotel = hotel_bookings_dropped_nan.value_counts('hotel')['Resort Hotel']

percentage_returning_customer_resort_hotel = (
    returning_customer_resort_hotel * 100 / total_customer_resort_hotel
)
percentage_returning_customer_resort_hotel.round(2)
# 4.44%

# visualize
hotel_index = ['City Hotel', 'Resort Hotel']
booking_columns = ['Total Bookings', 'Returning Customer', 'Percentage']
data_array = [
    (
        total_customer_city_hotel,
        returning_customer_city_hotel,
        percentage_returning_customer_city_hotel.round(2)
    ),
    (
        total_customer_resort_hotel,
        returning_customer_resort_hotel,
        percentage_returning_customer_resort_hotel.round(2)
    )
]

return_customer_df = pd.DataFrame(data_array, hotel_index, booking_columns)
return_customer_df

Hotel	Total Bookings	Returning Customer	Percentage
City Hotel	79330	2032	2.56
Resort Hotel	40060	1778	5.07

plot = return_customer_df[
    ['Total Bookings', 'Returning Customer']
].plot.bar(figsize=(12,8), rot=0)

arrival_date_year 	2016
arrival_date_month 	March
arrival_date_day_of_month 	25
Aug 27, 1989
pd.to_datetime(date_series)

city_hotel_bookings[[
    'arrival_date_year',
    'arrival_date_month',
    'arrival_date_day_of_month'
]].head(5)

	arrival_date_year	arrival_date_month	arrival_date_day_of_month
40060	2015	July	1
40061	2015	July	1
40062	2015	July	1
40063	2015	July	1
40064	2015	July	2

Correlate Bookings to Day of the Week

city_hotel_bookings['datetime'] = (
    pd.to_datetime(
        city_hotel_bookings['arrival_date_month'] + ' ' + city_hotel_bookings['arrival_date_day_of_month'].astype(str) + ' , ' + city_hotel_bookings['arrival_date_year'].astype(str)
    )
)

city_hotel_bookings['datetime']


40060	2015-07-01
40061	2015-07-01
40062	2015-07-01
40063	2015-07-01
40064	2015-07-02
...
119385	2017-08-30
119386	2017-08-31
119387	2017-08-31
119388	2017-08-31
119389	2017-08-29
Name: datetime, Length: 79330, dtype: datetime64[ns]

# get weekday out of datetime object
city_hotel_bookings['datetime'].loc[40064].weekday()
# 3 == Thursday

days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

def weekday(day_of_the_week):
        return days[day_of_the_week]

city_hotel_bookings['day_of_the_week'] = np.vectorize(weekday)(
        city_hotel_bookings['datetime'].dt.weekday
)

city_hotel_bookings['day_of_the_week'].tail(5)


119385	Wednesday
119386	Thursday
119387	Thursday
119388	Thursday
119389	Tuesday
Name: day_of_the_week, dtype: object

bookings_by_weekday_city_hotel = city_hotel_bookings.value_counts('day_of_the_week')
bookings_by_weekday_city_hotel

day_of_the_week
Friday	13955
Thursday	13009
Monday	11823
Wednesday	11229
Saturday	10993
Sunday	9194
Tuesday	9127
dtype: int64

bookings_by_weekday_city_hotel.plot.bar(figsize=(12,8), rot=0)

resort_hotel_bookings['datetime'] = (
    pd.to_datetime(
        resort_hotel_bookings['arrival_date_month'] + ' ' + resort_hotel_bookings['arrival_date_day_of_month'].astype(str) + ' , ' + resort_hotel_bookings['arrival_date_year'].astype(str)
    )
)

resort_hotel_bookings['day_of_the_week'] = np.vectorize(weekday)(
        resort_hotel_bookings['datetime'].dt.weekday
)

bookings_by_weekday_resort_hotel = resort_hotel_bookings.value_counts('day_of_the_week')
bookings_by_weekday_resort_hotel

day_of_the_week
Saturday	7062
Monday	6348
Thursday	6245
Friday	5676
Sunday	4947
Wednesday	4910
Tuesday	4872
dtype: int64

bookings_by_weekday_resort_hotel.plot.bar(figsize=(12,8), rot=0)

Bookings within a Date Range

first_15 = hotel_bookings_dropped_nan['arrival_date_day_of_month'].apply(lambda day: day in range(1,16)).sum()
first_15
# 58152

last_15 = hotel_bookings_dropped_nan['arrival_date_day_of_month'].apply(lambda day: day in range(15,32)).sum()
last_15
# 65434

mpolinowski/hotel-booking-dataset

Hotel Booking Demand Dataset

Dataset

Size

Missing Data

Exploration

Data Columns

Top Countries

Average Daily Rates

Average Stays

Average Cost per Stay

Percentage of Returning Guest

Correlate Bookings to Day of the Week

Bookings within a Date Range