Code
# 1️⃣ Load Data
import pandas as pd

# Read the raw hotel-bookings CSV (path is relative to the working directory).
df = pd.read_csv('../data/raw/hotel_bookings.csv')

df.shape   # (rows, columns) of the loaded frame
df.head()  # first five rows
5 rows × 32 columns
# 2️⃣ Data Summary
df.info()           # column dtypes and non-null counts
df.isnull().sum()   # number of missing values per column
df.describe()       # descriptive statistics
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 32 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 hotel 119390 non-null object
1 is_canceled 119390 non-null int64
2 lead_time 119390 non-null int64
3 arrival_date_year 119390 non-null int64
4 arrival_date_month 119390 non-null object
5 arrival_date_week_number 119390 non-null int64
6 arrival_date_day_of_month 119390 non-null int64
7 stays_in_weekend_nights 119390 non-null int64
8 stays_in_week_nights 119390 non-null int64
9 adults 119390 non-null int64
10 children 119386 non-null float64
11 babies 119390 non-null int64
12 meal 119390 non-null object
13 country 118902 non-null object
14 market_segment 119390 non-null object
15 distribution_channel 119390 non-null object
16 is_repeated_guest 119390 non-null int64
17 previous_cancellations 119390 non-null int64
18 previous_bookings_not_canceled 119390 non-null int64
19 reserved_room_type 119390 non-null object
20 assigned_room_type 119390 non-null object
21 booking_changes 119390 non-null int64
22 deposit_type 119390 non-null object
23 agent 103050 non-null float64
24 company 6797 non-null float64
25 days_in_waiting_list 119390 non-null int64
26 customer_type 119390 non-null object
27 adr 119390 non-null float64
28 required_car_parking_spaces 119390 non-null int64
29 total_of_special_requests 119390 non-null int64
30 reservation_status 119390 non-null object
31 reservation_status_date 119390 non-null object
dtypes: float64(4), int64(16), object(12)
memory usage: 29.1+ MB
# 3️⃣ Feature Engineering
# Total nights stayed = weekend nights + week nights.
df['stay_length'] = df['stays_in_weekend_nights'] + df['stays_in_week_nights']

# Parse the date column once and derive both calendar features from it.
# NOTE(review): these features come from `reservation_status_date`, which is
# presumably the date the reservation status was last set rather than the date
# the booking was made — confirm that `booking_month` is the intended name.
status_dates = pd.to_datetime(df['reservation_status_date'])
df['booking_month'] = status_dates.dt.month
df['weekday'] = status_dates.dt.weekday  # Monday=0 ... Sunday=6

# Preview the engineered columns next to lead_time.
df[['stay_length', 'lead_time', 'booking_month', 'weekday']].head()
# 4️⃣ Target Exploration - Cancellations
import seaborn as sns
import matplotlib.pyplot as plt

# Mean of the 0/1 `is_canceled` flag is the share of cancelled bookings.
cancel_rate = df['is_canceled'].mean()
print(f"Cancellation Rate: {cancel_rate:.2%}")

# Row counts per `booking_month`, split by cancellation flag.
sns.countplot(data=df, x='booking_month', hue='is_canceled')
plt.title('Cancellations by Month')
plt.show()
Cancellation Rate: 37.04%
# 5️⃣ Price Analysis
# Distribution of the average daily rate, with a kernel density overlay.
sns.histplot(df['adr'], kde=True)
plt.title('Average Daily Rate (ADR) Distribution')
plt.xlabel('ADR (Average Daily Rate)')
plt.show()

# ADR spread per market segment; tick labels are rotated by 45 degrees.
sns.boxplot(data=df, x='market_segment', y='adr')
plt.xticks(rotation=45)
plt.title('ADR by Market Segment')
plt.show()
# 6️⃣ Booking Patterns
# Convert the column to datetimes in place so the group keys below are dates.
df['reservation_status_date'] = pd.to_datetime(df['reservation_status_date'])

# Number of rows per reservation-status date.
# NOTE(review): the plot is labelled "Bookings", but rows are grouped by
# `reservation_status_date` — confirm this is the intended time axis.
bookings_by_date = df.groupby('reservation_status_date').size()

bookings_by_date.plot(figsize=(12, 6))
plt.title('Bookings Over Time')
plt.xlabel('Date')
plt.ylabel('Number of Bookings')
plt.show()