/hotel-booking-dataset

Python Pandas Dataset Exploration with Hotel Demand Data.

Primary LanguageJupyter Notebook

Hotel Booking Demand Dataset

see Hotel booking demand datasets, Kaggle

import numpy as np
import pandas as pd
import datetime

Dataset

hotel_bookings = pd.read_csv('datasets/hotel_bookings.csv')
hotel_bookings.head(5)

Size

# complete number of rows
print(hotel_bookings.index)
# RangeIndex(start=0, stop=119390, step=1)

# complete number of columns
print(len(hotel_bookings.columns))
# 32

Missing Data

# only show rows that have missing values
hotel_bookings_nan = hotel_bookings[hotel_bookings.isna().any(axis=1)]
hotel_bookings_nan
# 119173 rows × 32 columns
# only 119390 - 119173 =  217 rows don't have missing entries
hotel is_canceled lead_time arrival_date_year arrival_date_month arrival_date_week_number arrival_date_day_of_month stays_in_weekend_nights stays_in_week_nights adults ... deposit_type agent company days_in_waiting_list customer_type adr required_car_parking_spaces total_of_special_requests reservation_status reservation_status_date
0 Resort Hotel 0 342 2015 July 27 1 0 0 2 ... No Deposit NaN NaN 0 Transient 0.00 0 0 Check-Out 01-07-15
1 Resort Hotel 0 737 2015 July 27 1 0 0 2 ... No Deposit NaN NaN 0 Transient 0.00 0 0 Check-Out 01-07-15
2 Resort Hotel 0 7 2015 July 27 1 0 1 1 ... No Deposit NaN NaN 0 Transient 75.00 0 0 Check-Out 02-07-15
3 Resort Hotel 0 13 2015 July 27 1 0 1 1 ... No Deposit 304.0 NaN 0 Transient 75.00 0 0 Check-Out 02-07-15
4 Resort Hotel 0 14 2015 July 27 1 0 2 2 ... No Deposit 240.0 NaN 0 Transient 98.00 0 1 Check-Out 03-07-15
...
119385 City Hotel 0 23 2017 August 35 30 2 5 2 ... No Deposit 394.0 NaN 0 Transient 96.14 0 0 Check-Out 06-09-17
119386 City Hotel 0 102 2017 August 35 31 2 5 3 ... No Deposit 9.0 NaN 0 Transient 225.43 0 2 Check-Out 07-09-17
119387 City Hotel 0 34 2017 August 35 31 2 5 2 ... No Deposit 9.0 NaN 0 Transient 157.71 0 4 Check-Out 07-09-17
119388 City Hotel 0 109 2017 August 35 31 2 5 2 ... No Deposit 89.0 NaN 0 Transient 104.40 0 0 Check-Out 07-09-17
119389 City Hotel 0 205 2017 August 35 29 2 7 2 ... No Deposit 9.0 NaN 0 Transient 151.20 0 2 Check-Out 07-09-17
# which columns have the most missing entries
hotel_bookings.isna().sum()

# the columns company, agent and country have the most missing data:
hotel 0
is_canceled 0
lead_time 0
arrival_date_year 0
arrival_date_month 0
arrival_date_week_number 0
arrival_date_day_of_month 0
stays_in_weekend_nights 0
stays_in_week_nights 0
adults 0
children 4
babies 0
meal 0
country 488
market_segment 0
distribution_channel 0
is_repeated_guest 0
previous_cancellations 0
previous_bookings_not_canceled 0
reserved_room_type 0
assigned_room_type 0
booking_changes 0
deposit_type 0
agent 16340
company 112593
days_in_waiting_list 0
customer_type 0
adr 0
required_car_parking_spaces 0
total_of_special_requests 0
reservation_status 0
reservation_status_date 0
dtype: int64
# drop columns with missing data
hotel_bookings_dropped_nan = hotel_bookings.drop(['company', 'agent'], axis=1)
hotel_bookings_dropped_nan.head(2)
hotel_bookings_dropped_nan[hotel_bookings_dropped_nan.isna().any(axis=1)]
# 4 rows × 29 columns
# only the 4 rows with missing data in the children column and 488 country column remain

Exploration

Data Columns

# what columns do we have
pd.Series(hotel_bookings.columns)
0 hotel
1 is_canceled
2 lead_time
3 arrival_date_year
4 arrival_date_month
5 arrival_date_week_number
6 arrival_date_day_of_month
7 stays_in_weekend_nights
8 stays_in_week_nights
9 adults
10 children
11 babies
12 meal
13 country
14 market_segment
15 distribution_channel
16 is_repeated_guest
17 previous_cancellations
18 previous_bookings_not_canceled
19 reserved_room_type
20 assigned_room_type
21 booking_changes
22 deposit_type
23 agent
24 company
25 days_in_waiting_list
26 customer_type
27 adr
28 required_car_parking_spaces
29 total_of_special_requests
30 reservation_status
31 reservation_status_date
dtype: object

Top Countries

# top5 country codes
hotel_bookings_dropped_nan['country'].value_counts().head(5)
PRT 48590
GBR 12129
FRA 10415
ESP 8568
DEU 7287
Name: country, dtype: int64
hotel_bookings_dropped_nan['country'].value_counts().head(15).plot.bar(figsize=(12,4),rot=65)

Hotel Booking Demand Dataset

Average Daily Rates

plot = hotel_bookings_dropped_nan.plot.scatter(
    figsize=(12,8),
    x='adr',
    y='hotel')

# there are only 2 hotels and all adr's are within 0-500$ with one outlier above 5000$

Hotel Booking Demand Dataset

# find outlier
hotel_bookings_dropped_nan.sort_values('adr', ascending=False).iloc[0]
hotel City Hotel
is_canceled 1
lead_time 35
arrival_date_year 2016
arrival_date_month March
arrival_date_week_number 13
arrival_date_day_of_month 25
stays_in_weekend_nights 0
stays_in_week_nights 1
adults 2
children 0.0
babies 0
meal BB
country PRT
market_segment Offline TA/TO
distribution_channel TA/TO
is_repeated_guest 0
previous_cancellations 0
previous_bookings_not_canceled 0
reserved_room_type A
assigned_room_type A
booking_changes 1
deposit_type Non Refund
days_in_waiting_list 0
customer_type Transient
adr 5400.0
required_car_parking_spaces 0
total_of_special_requests 0
reservation_status Canceled
reservation_status_date 19-02-16
Name: 48515, dtype: object
plot = hotel_bookings_dropped_nan.plot.hist(
    column=["adr"],
    by="hotel",
    bins=100,
    figsize=(10, 8)
)

# the outlier squeezes the first histogram and makes it hard to compare them

Hotel Booking Demand Dataset

# let's find the outlier iloc and drop the row
hotel_bookings_dropped_nan['adr'].idxmax()
# 48515
hotel_bookings_dropped_outlier = hotel_bookings_dropped_nan.drop(48515, axis=0)

plot = hotel_bookings_dropped_outlier.plot.hist(
    column=["adr"],
    by="hotel",
    bins=100,
    figsize=(10, 8)
)

# nice :)

Hotel Booking Demand Dataset

# calculate the average daily rate `adr` for a guest staying at each hotel
adr_by_hotel = hotel_bookings_dropped_nan.groupby('hotel').mean(numeric_only=True)['adr']
adr_by_hotel

Average Daily Rate

hotel
City Hotel 105.304465
Resort Hotel 94.952930
Name: adr, dtype: float64

Average Stays

# how long do guest stay on average
hotel_bookings_dropped_nan['total_days'] = hotel_bookings_dropped_nan['stays_in_weekend_nights'] + hotel_bookings_dropped_nan['stays_in_week_nights']
hotel_bookings_dropped_nan['total_days'].head(5)
0 0
1 0
2 1
3 1
4 2
Name: total_days, dtype: int64
average_stays = hotel_bookings_dropped_nan.groupby('hotel').mean(numeric_only=True).round(1)['total_days']
average_stays

# the average staying time is 3 and 4.3 days, respectively
hotel
City Hotel 3.0
Resort Hotel 4.3
_Name: total days, dtype: float64

Average Cost per Stay

# given the # of days and average daily adr we can calculate the average total cost per stay
hotel_bookings_dropped_nan['total_cost'] = hotel_bookings_dropped_nan['total_days'] * hotel_bookings_dropped_nan['adr']
hotel_bookings_dropped_nan['total_cost'].head(5)
0 0.0
1 0.0
2 75.0
3 75.0
4 196.0
Name: total_days, dtype: float64
average_total_cost = hotel_bookings_dropped_nan.groupby('hotel').mean(numeric_only=True).round(2)['total_cost']
average_total_cost
hotel
City Hotel 318.66
Resort Hotel 435.45
Name: total_days, dtype: float64

Percentage of Returning Guest

# total number of bookings per hotel
hotel_bookings_dropped_nan.value_counts('hotel')
hotel
City Hotel 79330
Resort Hotel 40060
dtype: int64
# select only city hotel
city_hotel_bookings = hotel_bookings_dropped_nan[hotel_bookings_dropped_nan['hotel'] == 'City Hotel']
city_hotel_bookings['hotel'].head(5)
40060 City Hotel
40061 City Hotel
40062 City Hotel
40063 City Hotel
40064 City Hotel
Name: hotel, dtype: object
# select only resort hotel
resort_hotel_bookings = hotel_bookings_dropped_nan[hotel_bookings_dropped_nan['hotel'] == 'Resort Hotel']
resort_hotel_bookings['hotel'].head(5)
0 Resort Hotel
1 Resort Hotel
2 Resort Hotel
3 Resort Hotel
4 Resort Hotel
Name: hotel, dtype: object
returning_customer_city_hotel = sum(city_hotel_bookings['is_repeated_guest'] == 1)
returning_customer_city_hotel
# 2032
total_customer_city_hotel = hotel_bookings_dropped_nan.value_counts('hotel')['City Hotel']
total_customer_city_hotel
# 79330
percentage_returning_customer_city_hotel = (
    returning_customer_city_hotel * 100 / total_customer_city_hotel
)
percentage_returning_customer_city_hotel.round(2)
# 2.56%
returning_customer_resort_hotel = sum(resort_hotel_bookings['is_repeated_guest'] == 1)
total_customer_resort_hotel = hotel_bookings_dropped_nan.value_counts('hotel')['Resort Hotel']

percentage_returning_customer_resort_hotel = (
    returning_customer_resort_hotel * 100 / total_customer_resort_hotel
)
percentage_returning_customer_resort_hotel.round(2)
# 4.44%
# visualize
hotel_index = ['City Hotel', 'Resort Hotel']
booking_columns = ['Total Bookings', 'Returning Customer', 'Percentage']
data_array = [
    (
        total_customer_city_hotel,
        returning_customer_city_hotel,
        percentage_returning_customer_city_hotel.round(2)
    ),
    (
        total_customer_resort_hotel,
        returning_customer_resort_hotel,
        percentage_returning_customer_resort_hotel.round(2)
    )
]

return_customer_df = pd.DataFrame(data_array, hotel_index, booking_columns)
return_customer_df
Hotel Total Bookings Returning Customer Percentage
City Hotel 79330 2032 2.56
Resort Hotel 40060 1778 5.07
plot = return_customer_df[
    ['Total Bookings', 'Returning Customer']
].plot.bar(figsize=(12,8), rot=0)

Hotel Booking Demand Dataset

arrival_date_year 	2016
arrival_date_month 	March
arrival_date_day_of_month 	25
Aug 27, 1989
pd.to_datetime(date_series)
city_hotel_bookings[[
    'arrival_date_year',
    'arrival_date_month',
    'arrival_date_day_of_month'
]].head(5)
arrival_date_year arrival_date_month arrival_date_day_of_month
40060 2015 July 1
40061 2015 July 1
40062 2015 July 1
40063 2015 July 1
40064 2015 July 2

Correlate Bookings to Day of the Week

city_hotel_bookings['datetime'] = (
    pd.to_datetime(
        city_hotel_bookings['arrival_date_month'] + ' ' + city_hotel_bookings['arrival_date_day_of_month'].astype(str) + ' , ' + city_hotel_bookings['arrival_date_year'].astype(str)
    )
)

city_hotel_bookings['datetime']
40060 2015-07-01
40061 2015-07-01
40062 2015-07-01
40063 2015-07-01
40064 2015-07-02
...
119385 2017-08-30
119386 2017-08-31
119387 2017-08-31
119388 2017-08-31
119389 2017-08-29
Name: datetime, Length: 79330, dtype: datetime64[ns]
# get weekday out of datetime object
city_hotel_bookings['datetime'].loc[40064].weekday()
# 3 == Thursday
days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

def weekday(day_of_the_week):
        return days[day_of_the_week]
city_hotel_bookings['day_of_the_week'] = np.vectorize(weekday)(
        city_hotel_bookings['datetime'].dt.weekday
)

city_hotel_bookings['day_of_the_week'].tail(5)
119385 Wednesday
119386 Thursday
119387 Thursday
119388 Thursday
119389 Tuesday
Name: day_of_the_week, dtype: object
bookings_by_weekday_city_hotel = city_hotel_bookings.value_counts('day_of_the_week')
bookings_by_weekday_city_hotel
day_of_the_week
Friday 13955
Thursday 13009
Monday 11823
Wednesday 11229
Saturday 10993
Sunday 9194
Tuesday 9127
dtype: int64
bookings_by_weekday_city_hotel.plot.bar(figsize=(12,8), rot=0)

Hotel Booking Demand Dataset

resort_hotel_bookings['datetime'] = (
    pd.to_datetime(
        resort_hotel_bookings['arrival_date_month'] + ' ' + resort_hotel_bookings['arrival_date_day_of_month'].astype(str) + ' , ' + resort_hotel_bookings['arrival_date_year'].astype(str)
    )
)

resort_hotel_bookings['day_of_the_week'] = np.vectorize(weekday)(
        resort_hotel_bookings['datetime'].dt.weekday
)

bookings_by_weekday_resort_hotel = resort_hotel_bookings.value_counts('day_of_the_week')
bookings_by_weekday_resort_hotel
day_of_the_week
Saturday 7062
Monday 6348
Thursday 6245
Friday 5676
Sunday 4947
Wednesday 4910
Tuesday 4872
dtype: int64
bookings_by_weekday_resort_hotel.plot.bar(figsize=(12,8), rot=0)

Hotel Booking Demand Dataset

Bookings within a Date Range

first_15 = hotel_bookings_dropped_nan['arrival_date_day_of_month'].apply(lambda day: day in range(1,16)).sum()
first_15
# 58152
last_15 = hotel_bookings_dropped_nan['arrival_date_day_of_month'].apply(lambda day: day in range(15,32)).sum()
last_15
# 65434