#https://insideairbnb.com/bangkok/

Set the Directory, Loaded the data, and “tidyverse”

setwd("D:/me/R-Language/Practice/Dataset")

options(repos = c(CRAN = "https://cran.rstudio.com/"))
install.packages("tidyverse")
## Installing package into 'C:/Users/lenovo/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'tidyverse' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\lenovo\AppData\Local\Temp\RtmpIZ47EY\downloaded_packages
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the CSV file into a data frame
data <- read.csv("listings_airbnb_Aug2024.csv")

Check the data and Columns

# View the first few rows of the data
head(data)
##      id                                           name host_id  host_name
## 1 27934                Nice room with superb city view  120437     Nuttee
## 2 27979                 Easy going landlord,easy place  120541        Emy
## 3 28745              modern-style apartment in Bangkok  123784 Familyroom
## 4 35780 Spacious one bedroom at The Kris Condo Bldg. 3  153730    Sirilak
## 5 48736                Condo with Chaopraya River View  222005   Athitaya
## 6 55681                  Sathorn Terrace Apartment(61)  263049        Tor
##   neighbourhood_group neighbourhood latitude longitude       room_type price
## 1                  NA   Ratchathewi 13.75983  100.5413 Entire home/apt  2020
## 2                  NA       Bang Na 13.66818  100.6167    Private room    NA
## 3                  NA     Bang Kapi 13.75232  100.6240    Private room    NA
## 4                  NA     Din Daeng 13.78823  100.5726    Private room  1286
## 5                  NA    Rat Burana 13.68556  100.4954    Private room  1653
## 6                  NA      Bang Rak 13.71934  100.5176    Private room  1150
##   minimum_nights number_of_reviews last_review reviews_per_month
## 1              3                64  2020-01-06              0.43
## 2              1                 0                            NA
## 3             60                 0                            NA
## 4             14                 6  2024-05-22              0.06
## 5              3                 1  2014-02-03              0.01
## 6              2                34  2024-04-17              0.21
##   calculated_host_listings_count availability_365 number_of_reviews_ltm license
## 1                              2              362                     0        
## 2                              2                0                     0        
## 3                              1                0                     0        
## 4                              1              309                     2        
## 5                              1              365                     0        
## 6                              7              356                     5
str(data)
## 'data.frame':    23651 obs. of  18 variables:
##  $ id                            : num  27934 27979 28745 35780 48736 ...
##  $ name                          : chr  "Nice room with superb city view" "Easy going landlord,easy place" "modern-style apartment in Bangkok" "Spacious one bedroom at The Kris Condo Bldg. 3" ...
##  $ host_id                       : int  120437 120541 123784 153730 222005 263049 263049 294896 302658 272478 ...
##  $ host_name                     : chr  "Nuttee" "Emy" "Familyroom" "Sirilak" ...
##  $ neighbourhood_group           : logi  NA NA NA NA NA NA ...
##  $ neighbourhood                 : chr  "Ratchathewi" "Bang Na" "Bang Kapi" "Din Daeng" ...
##  $ latitude                      : num  13.8 13.7 13.8 13.8 13.7 ...
##  $ longitude                     : num  101 101 101 101 100 ...
##  $ room_type                     : chr  "Entire home/apt" "Private room" "Private room" "Private room" ...
##  $ price                         : int  2020 NA NA 1286 1653 1150 1384 1102 NA 1543 ...
##  $ minimum_nights                : int  3 1 60 14 3 2 2 30 1 90 ...
##  $ number_of_reviews             : int  64 0 0 6 1 34 210 2 0 18 ...
##  $ last_review                   : chr  "2020-01-06" "" "" "2024-05-22" ...
##  $ reviews_per_month             : num  0.43 NA NA 0.06 0.01 0.21 1.29 0.01 NA 0.11 ...
##  $ calculated_host_listings_count: int  2 2 1 1 1 7 7 2 1 1 ...
##  $ availability_365              : int  362 0 0 309 365 356 365 362 0 358 ...
##  $ number_of_reviews_ltm         : int  0 0 0 2 0 5 2 0 0 0 ...
##  $ license                       : chr  "" "" "" "" ...
unique_room_type <- unique(data$room_type)
unique_room_type
## [1] "Entire home/apt" "Private room"    "Hotel room"      "Shared room"

Distribution of the Room Type

data %>%
  count(room_type) %>%
  ggplot(aes(x = reorder(room_type, -n), y = n)) +
  geom_bar(stat = "identity", fill = "skyblue", color = "black", width = 0.7) +
  geom_text(aes(label = n), vjust = -0.5, color = "black", size = 3.5) +
  labs(title = "Distribution of Room Types", x = "Room Type", y = NULL) +
  theme_minimal() + 
  theme(plot.title = element_text(hjust = 0.5))

Hosts with multiple listings

top_hosts <- data %>%
  group_by(host_name, room_type) %>%
  summarize(listings_count = n(), .groups = 'drop') %>%
  pivot_wider(
    names_from = room_type,
    values_from = listings_count,
    values_fill = list(listings_count = 0)  # Fill in 0 for missing values
  ) %>%
  mutate(
    `Listings` = `Entire home/apt` + `Private room` + `Shared room` + `Hotel room`
  ) %>% arrange(desc(Listings))

top_hosts
## # A tibble: 6,263 × 6
##    host_name `Entire home/apt` `Private room` `Hotel room` `Shared room`
##    <chr>                 <int>          <int>        <int>         <int>
##  1 Alice                    80            173            0             1
##  2 Curry                   225              0            0             1
##  3 Krittika                175             10            0             0
##  4 Elmer                     0            153            0             8
##  5 Alex                    151              6            1             0
##  6 Tony                    145              0            0             0
##  7 Noons                   123              0            0             2
##  8 Max                      96             10            0             0
##  9 Yang                    103              0            0             0
## 10 K                        87              3            8             0
## # ℹ 6,253 more rows
## # ℹ 1 more variable: Listings <int>

Availability Distribution for Short-Term vs Long-Term Rentals

short_term <- data %>% filter(minimum_nights <= 7)
long_term <- data %>% filter(minimum_nights > 7)


ggplot() +
  geom_histogram(data = short_term, aes(x = availability_365, fill = 'Short-Term'), bins = 30, alpha = 0.5) +
  geom_histogram(data = long_term, aes(x = availability_365, fill = 'Long-Term'), bins = 30, alpha = 0.5) +
  labs(title = "Availability Distribution for Short-Term vs Long-Term Rentals",
       x = "Availability (days per year)",
       y = "Count") +
  scale_fill_manual(values = c('Short-Term' = 'blue', 'Long-Term' = 'red')) +
  theme_minimal()

Mean, Median, and Avg_availability of short-term vs long-term rentals

summary_short_term <- short_term %>%
  summarise(average_price = mean(price, na.rm = TRUE),
            median_price = median(price, na.rm = TRUE),
            avg_availability = mean(availability_365, na.rm = TRUE))

summary_long_term <- long_term %>%
  summarise(average_price = mean(price, na.rm = TRUE),
            median_price = median(price, na.rm = TRUE),
            avg_availability = mean(availability_365, na.rm = TRUE))

summary_short_term
##   average_price median_price avg_availability
## 1      2644.593       1472.5         221.0468
summary_long_term
##   average_price median_price avg_availability
## 1      2074.552         1250         210.4231

Clean NA and infinite numbers

sum(!is.finite(data$price))
## [1] 4639
sum(is.na(data$price))
## [1] 4639
data_clean <- data %>%
  filter(is.finite(price) & !is.na(price))

str(data_clean)
## 'data.frame':    19012 obs. of  18 variables:
##  $ id                            : num  27934 35780 48736 55681 55686 ...
##  $ name                          : chr  "Nice room with superb city view" "Spacious one bedroom at The Kris Condo Bldg. 3" "Condo with Chaopraya River View" "Sathorn Terrace Apartment(61)" ...
##  $ host_id                       : int  120437 153730 222005 263049 263049 294896 272478 545890 578110 610315 ...
##  $ host_name                     : chr  "Nuttee" "Sirilak" "Athitaya" "Tor" ...
##  $ neighbourhood_group           : logi  NA NA NA NA NA NA ...
##  $ neighbourhood                 : chr  "Ratchathewi" "Din Daeng" "Rat Burana" "Bang Rak" ...
##  $ latitude                      : num  13.8 13.8 13.7 13.7 13.7 ...
##  $ longitude                     : num  101 101 100 101 101 ...
##  $ room_type                     : chr  "Entire home/apt" "Private room" "Private room" "Private room" ...
##  $ price                         : int  2020 1286 1653 1150 1384 1102 1543 6024 1469 1190 ...
##  $ minimum_nights                : int  3 14 3 2 2 30 90 28 30 1 ...
##  $ number_of_reviews             : int  64 6 1 34 210 2 18 147 0 6 ...
##  $ last_review                   : chr  "2020-01-06" "2024-05-22" "2014-02-03" "2024-04-17" ...
##  $ reviews_per_month             : num  0.43 0.06 0.01 0.21 1.29 0.01 0.11 0.95 NA 0.35 ...
##  $ calculated_host_listings_count: int  2 1 1 7 7 2 1 1 1 3 ...
##  $ availability_365              : int  362 309 365 356 365 362 358 362 365 365 ...
##  $ number_of_reviews_ltm         : int  0 2 0 5 2 0 0 0 0 5 ...
##  $ license                       : chr  "" "" "" "" ...

Boxplot of prices by room type

# Boxplot of prices by room type
data_clean %>% ggplot(aes(x = room_type, y = price)) +
  geom_boxplot(fill = "lightblue", color = "blue") +
  labs(title = "Price by Room Type", x = "Room Type", y = "Price")

Boxplot of prices by room type without outliers

# Boxplot of prices by room type with y-axis limits
data_clean %>% ggplot(aes(x = room_type, y = price)) +
  geom_boxplot(fill = "lightblue", color = "blue") +
  labs(title = "Price by Room Type", x = "Room Type", y = "Price") +
  coord_cartesian(ylim = c(0, 6000))


Scatter plot of price vs. number of reviews

# Scatter plot of price vs. number of reviews
data_clean %>%
  ggplot(aes(x = number_of_reviews, y = price)) +
  geom_point() +
  labs(title = "Price vs. Number of Reviews", x = "Number of Reviews", y = "Price")+ 
  coord_cartesian(ylim = c(0, 10000))

# Compute correlation
cor(data_clean$price, data_clean$number_of_reviews, use = "complete.obs")
## [1] -0.00500892

15 highest Listings by Neighborhood

# Count of listings by neighborhood and select top 15
neighborhood_counts <- data_clean %>%
  group_by(neighbourhood) %>%
  summarise(count = n()) %>%
  slice_max(order_by = count, n = 15)
neighborhood_counts
## # A tibble: 15 × 2
##    neighbourhood count
##    <chr>         <int>
##  1 Vadhana        3205
##  2 Khlong Toei    2870
##  3 Huai Khwang    1692
##  4 Ratchathewi    1250
##  5 Sathon          906
##  6 Phra Khanong    781
##  7 Phra Nakhon     735
##  8 Bang Rak        725
##  9 Chatu Chak      617
## 10 Parthum Wan     473
## 11 Din Daeng       470
## 12 Bang Na         454
## 13 Khlong San      452
## 14 Suanluang       444
## 15 Bang Phlat      363
# Bar plot of listing counts by neighborhood, arranged in descending order
neighborhood_counts %>% ggplot(aes(x = reorder(neighbourhood, -count), y = count)) +
  geom_bar(stat = "identity", fill = "lightcoral", color = "darkred") +
  labs(title = "15 highest Listings by Neighborhood", x = "Neighborhood", y = "Number of Listings") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

15 Highest Average Price by Neighborhood

# Average price by neighborhood
neighborhood_avg_price <- data_clean %>%
  group_by(neighbourhood) %>%
  summarise(avg_price = mean(price, na.rm = TRUE))%>%
  slice_max(order_by = avg_price, n = 15)

# Bar plot of average price by neighborhood
neighborhood_avg_price %>% ggplot(aes(x = reorder(neighbourhood, -avg_price), y = avg_price)) +
  geom_bar(stat = "identity", fill = "lightblue", color = "blue") +
  labs(title = "15 Highest Average Price by Neighborhood", x = "Neighborhood", y = "Average Price") +
  coord_cartesian(ylim = c(0, 10000)) +  # Adjust y-axis as needed
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Cleaning Data with sf package to create a geometry column using latitude and longitude

install.packages("sf")
## Installing package into 'C:/Users/lenovo/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## also installing the dependencies 'wk', 'classInt', 's2', 'units'
## package 'wk' successfully unpacked and MD5 sums checked
## package 'classInt' successfully unpacked and MD5 sums checked
## package 's2' successfully unpacked and MD5 sums checked
## package 'units' successfully unpacked and MD5 sums checked
## package 'sf' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\lenovo\AppData\Local\Temp\RtmpIZ47EY\downloaded_packages
library(sf)
## Linking to GEOS 3.12.1, GDAL 3.8.4, PROJ 9.3.1; sf_use_s2() is TRUE
data_clean_sf <- st_as_sf(data_clean, coords = c("longitude", "latitude"), crs = 4326)
install.packages("plotly")
## Installing package into 'C:/Users/lenovo/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## also installing the dependencies 'later', 'htmlwidgets', 'lazyeval', 'crosstalk', 'promises'
## package 'later' successfully unpacked and MD5 sums checked
## package 'htmlwidgets' successfully unpacked and MD5 sums checked
## package 'lazyeval' successfully unpacked and MD5 sums checked
## package 'crosstalk' successfully unpacked and MD5 sums checked
## package 'promises' successfully unpacked and MD5 sums checked
## package 'plotly' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\lenovo\AppData\Local\Temp\RtmpIZ47EY\downloaded_packages
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout

Geographical Distribution of Airbnb - Bangkok Listings along with their Price

# Creating the ggplot object
p <- ggplot(data_clean_sf) +
  geom_sf(aes(color = neighbourhood, text = paste("Neighbourhood:", neighbourhood, "<br>Price (THB):", price))) +
  theme_minimal() +
  labs(title = "Geographical Distribution of Airbnb - Bangkok Listings",
       subtitle = "Colored by Neighborhood",
       x = "Longitude",
       y = "Latitude") +
  theme(legend.position = "none")  # Remove the legend
## Warning in layer_sf(geom = GeomSf, data = data, mapping = mapping, stat = stat,
## : Ignoring unknown aesthetics: text
# Converting to an interactive plot
p_interactive <- ggplotly(p, tooltip = "text")

# Displaying the interactive plot
p_interactive