Load the necessary libraries

library(readr)
library(tidyr)
library(tidyverse)
library(dplyr)
library(janitor)
library(ggplot2)
library(lubridate)
library(skimr)
library(forcats)
library(scales)
library(mapview)
library(here)

# Show the absolute path that here() resolves for the data file name.
# The value is only printed, not assigned.
here::here("202004-divvy-tripdata.csv")

## [1] "C:/Users/csoen/OneDrive/Documents/202004-divvy-tripdata.csv"

Load the data

# Read the raw Divvy trip file into a tibble.
# The path is built with here() so that it is resolved from the project root
# instead of depending on the current working directory.
trips0 <- read_csv(
  here(
    "Google Data Analytics",
    "course 7 R code",
    "202004-divvy-tripdata.csv"
  )
)

Check for NA values

# Count the missing values in every column of the raw data.
trips1 <- colSums(is.na(trips0))

Remove empty rows and columns and automatically clean column names

# Drop rows and columns that are entirely empty, then normalise the column
# names with clean_names().
trips2 <- clean_names(
  remove_empty(trips0, which = c("rows", "cols"))
)

Change the latitude and longitude format from character to numeric type

# Coerce the four coordinate columns to numeric in a single step.
trips2 <- trips2 %>%
  mutate(
    across(c(start_lat, start_lng, end_lat, end_lng), as.numeric)
  )

Change format to date-time

# Convert both trip timestamps to date-time values.
# ended_at is converted together with started_at so that the trip duration
# computed from them subtracts two values that were parsed the same way.
trips3 <- trips2 %>%
  mutate(across(c(started_at, ended_at), as_datetime))

Make new columns based on date-time information

# Derive calendar parts of the start time and the trip length in minutes.
trips4 <- mutate(
  trips3,
  hour_start = hour(started_at),
  week       = week(started_at),
  weekday    = wday(started_at, label = TRUE, abbr = FALSE),
  day        = day(started_at),
  trip_time  = difftime(ended_at, started_at, units = "min")
)

Change to factor data type and rename columns for easier understanding

# Give the bike-type and rider-type columns shorter names and store them as
# factors.
trips5 <- trips4 %>%
  rename(bikes = rideable_type, users = member_casual) %>%
  mutate(across(c(bikes, users), as_factor))

Check for duplicate data

# Number of rows that are exact duplicates of an earlier row.
sum(duplicated(trips5))

## [1] 0

Filter out illogical trip times (keep trips lasting between 1 minute and 24 hours)

# Keep trips lasting between 1 and 1440 minutes (24 hours) and drop rows
# without an end position.
# trip_time is created by difftime(); plain comparisons are used so the filter
# does not rely on how between() treats classed vectors.
trips6 <- trips5 %>%
  filter(trip_time >= 1, trip_time <= 1440) %>%
  drop_na(end_lat, end_lng)

Select columns that contain date-time information

# Columns describing when a trip started and how long it lasted.
trips_time <- select(
  trips6,
  all_of(c(
    "ride_id", "bikes", "users", "hour_start",
    "week", "weekday", "day", "trip_time"
  ))
)

Select columns that contain location information

# Columns describing where a trip started and ended.
trips_location <- select(
  trips6,
  all_of(c(
    "ride_id", "bikes", "users",
    "start_station_name", "start_lat", "start_lng",
    "end_station_name", "start_station_id",
    "end_lat", "end_lng", "end_station_id"
  ))
)

Data Visualization

Set plot theme

# Shared plot theme: theme_light() with custom title, legend and axis styling.
# The trailing comma after the last theme() argument was removed so that no
# empty argument is passed to theme().
# NOTE(review): ggplot2 >= 3.4.0 prefers `linewidth` over `size` in
# element_rect()/element_line(); `size` is kept so older versions still work.
newtheme <- theme_light() +
  theme(
    plot.title = element_text(colour = "#002949", face = "bold", size = 12),
    plot.subtitle = element_text(colour = "#890000", size = 10),
    plot.caption = element_text(colour = "#890000", face = "italic", size = 8),
    panel.border = element_rect(colour = "#002949", size = 1),
    legend.position = "right",
    legend.text = element_text(colour = "blue", size = 10, face = "bold"),
    legend.title = element_text(colour = "blue", size = 10, face = "bold"),
    axis.title.x = element_text(colour = "#890000"),
    axis.title.y = element_text(colour = "#002949"),
    axis.text.x = element_text(angle = 45, hjust = 1, colour = "#890000"),
    axis.text.y = element_text(angle = 45, hjust = 1, colour = "#002949"),
    axis.line = element_line(colour = "#002949", size = 1)
  )

# Make the theme the default for every plot created afterwards.
theme_set(newtheme)

Hourly Data Visualization

# Per user type and start hour: ride count, mean duration and total duration.
# `.groups = "drop"` returns an ungrouped tibble and silences the
# "grouped output" message that summarise() otherwise prints.
ride_hours <- trips_time %>%
  group_by(users, hour_start) %>%
  summarise(
    nr_rides = n(),
    mean_time = mean(trip_time),
    total_time = sum(trip_time),
    .groups = "drop"
  )

## `summarise()` has grouped output by 'users'. You can override using the
## `.groups` argument.

Hourly number of rides

# Figure 1: number of rides per start hour, bars dodged by user type.
ride_hours %>%
  ggplot(aes(x = hour_start, y = nr_rides, fill = users)) +
  geom_col(position = "dodge") +
  scale_y_continuous() +
  labs(
    title = "Number of Trips per Hour",
    subtitle = "Number of Trips for every Hours segmented by Users",
    caption = "Figure 1",
    x = "hour of the day",
    y = "number of rides"
  )

Hourly Average trip time

# Figure 2: mean trip duration per start hour, bars dodged by user type.
ride_hours %>%
  ggplot(aes(x = hour_start, y = mean_time, fill = users)) +
  geom_col(position = "dodge") +
  scale_y_continuous() +
  labs(
    title = "Average duration of Trips per Hour",
    subtitle = "Average duration of Trips for every Hours and segmented Users",
    caption = "Figure 2",
    x = "hour of the day",
    y = "Average duration of rides"
  )

Hourly Total trip time

# Figure 3: total trip duration per start hour, bars dodged by user type.
ride_hours %>%
  ggplot(aes(x = hour_start, y = total_time, fill = users)) +
  geom_col(position = "dodge") +
  scale_y_continuous() +
  labs(
    title = "Total duration of Trips per Hour",
    subtitle = "Total duration of Trips for every Hours and segmented by Users",
    caption = "Figure 3",
    x = "hour of the day",
    y = "Total time of rides"
  )

Day of the week visualization

# Per user type and weekday: ride count, mean duration and total duration.
# `.groups = "drop"` returns an ungrouped tibble and silences the
# "grouped output" message that summarise() otherwise prints.
ride_weekly <- trips_time %>%
  group_by(users, weekday) %>%
  summarise(
    nr_weekly = n(),
    mean_week = mean(trip_time),
    total_week = sum(trip_time),
    .groups = "drop"
  )

## `summarise()` has grouped output by 'users'. You can override using the
## `.groups` argument.

Day of the week number of rides

# Figure 4: number of rides per weekday, bars dodged by user type.
# The x axis shows the weekday column, so the title and axis label say
# "day of the week" instead of "Week".
ggplot(ride_weekly, aes(x = weekday, y = nr_weekly, fill = users)) +
  geom_col(position = "dodge") +
  scale_y_continuous() +
  labs(
    title = "Number of Trips per Day of the Week",
    subtitle = "Number of Trips for day of the week and segmented by Users",
    caption = "Figure 4",
    x = "Day of the week",
    y = "Number of rides"
  )

Day of the week Average trip time

# Figure 5: mean trip duration per weekday, bars dodged by user type.
# The x axis shows the weekday column, so the title and axis label say
# "day of the week" instead of "Week".
ggplot(ride_weekly, aes(x = weekday, y = mean_week, fill = users)) +
  geom_col(position = "dodge") +
  scale_y_continuous() +
  labs(
    title = "Average duration of Trips per Day of the Week",
    subtitle = "Average duration of Trips for day of the week and segmented by Users",
    caption = "Figure 5",
    x = "Day of the week",
    y = "Average duration of rides"
  )

Day of the week Total trip time

# Figure 6: total trip duration per weekday, bars dodged by user type.
# The x axis shows the weekday column, so the title and axis label say
# "day of the week" instead of "Week".
ggplot(ride_weekly, aes(x = weekday, y = total_week, fill = users)) +
  geom_col(position = "dodge") +
  scale_y_continuous() +
  labs(
    title = "Total duration of Trips per Day of the Week",
    subtitle = "Total duration of Trips for day of the week and segmented by Users",
    caption = "Figure 6",
    x = "Day of the week",
    y = "Total duration of rides"
  )

Day of the month Visualization

# Per user type and day of the month: ride count, mean duration and total
# duration.
# `.groups = "drop"` returns an ungrouped tibble and silences the
# "grouped output" message that summarise() otherwise prints.
ride_day <- trips_time %>%
  group_by(users, day) %>%
  summarise(
    nr_day = n(),
    mean_day = mean(trip_time),
    total_day = sum(trip_time),
    .groups = "drop"
  )

## `summarise()` has grouped output by 'users'. You can override using the
## `.groups` argument.

Day of the month number of rides

# Figure 7: number of rides per day of the month, bars dodged by user type.
ggplot(ride_day, aes(x = day, y = nr_day, fill = users)) +
  geom_col(position = "dodge") +
  scale_y_continuous() +
  labs(
    title = "Number of Trips per day of the month",
    subtitle = "Number of Trips for day of the month and segmented by Users",
    caption = "Figure 7",
    x = "day",
    y = "Number of rides"
  )

Day of the month Average trip time

# Figure 8: mean trip duration per day of the month, bars dodged by user type.
ggplot(ride_day, aes(x = day, y = mean_day, fill = users)) +
  geom_col(position = "dodge") +
  scale_y_continuous() +
  labs(
    title = "Average duration of Trips per day of the month",
    subtitle = "Average duration of Trips for day of the month and segmented by Users",
    caption = "Figure 8",
    x = "day",
    y = "Average duration of rides"
  )

Day of the month Total trip time

# Figure 9: total trip duration per day of the month, bars dodged by user type.
ggplot(ride_day, aes(x = day, y = total_day, fill = users)) +
  geom_col(position = "dodge") +
  scale_y_continuous() +
  labs(
    title = "Total duration of Trips per day of the month",
    subtitle = "Total duration of Trips for day of the month and segmented by Users",
    caption = "Figure 9",
    x = "day",
    y = "Total duration of rides"
  )

Visualize by location

# Rides per user type and start station (with its coordinates), busiest first.
# count() replaces group_by() + summarise(n()) + arrange(): it returns an
# ungrouped tibble sorted by descending count and prints no grouping message.
pop_start_station <- trips_location %>%
  count(
    users, start_station_name, start_lat, start_lng,
    name = "nr_rides_start",
    sort = TRUE
  )

## `summarise()` has grouped output by 'users', 'start_station_name', 'start_lat'.
## You can override using the `.groups` argument.

# Rides per user type and end station (with its coordinates), busiest first.
# count() replaces group_by() + summarise(n()) + arrange(): it returns an
# ungrouped tibble sorted by descending count and prints no grouping message.
pop_end_station <- trips_location %>%
  count(
    users, end_station_name, end_lat, end_lng,
    name = "nr_rides_end",
    sort = TRUE
  )

## `summarise()` has grouped output by 'users', 'end_station_name', 'end_lat'. You
## can override using the `.groups` argument.

Top 10 Start station

# Figure 10: the first ten rows of pop_start_station (sorted busiest first),
# drawn as horizontal bars dodged by user type.
# ungroup() + slice_head() take the first 10 rows overall and return fewer
# rows when the table is shorter, instead of indexing rows 1:10 directly.
pop_start_station %>%
  ungroup() %>%
  slice_head(n = 10) %>%
  ggplot(aes(x = start_station_name, y = nr_rides_start, fill = users)) +
  geom_col(position = "dodge") +
  coord_flip() +
  labs(
    title = "Most Popular Start Stations",
    subtitle = "Top 10 most popular start stations",
    caption = "Fig 10 ",
    x = "station name",
    y = "number of trips"
  )

Top 10 end station

# Figure 11: the first ten rows of pop_end_station (sorted busiest first),
# drawn as horizontal bars dodged by user type.
# ungroup() + slice_head() take the first 10 rows overall and return fewer
# rows when the table is shorter, instead of indexing rows 1:10 directly.
pop_end_station %>%
  ungroup() %>%
  slice_head(n = 10) %>%
  ggplot(aes(x = end_station_name, y = nr_rides_end, fill = users)) +
  geom_col(position = "dodge") +
  coord_flip() +
  labs(
    title = "Most Popular End Stations Segmented by Users",
    subtitle = "Top 10 most popular end stations",
    caption = "Fig 11",
    x = "station name",
    y = "number of trips"
  )

Mapview of 30 most popular start station

# Interactive map of the first 30 rows of pop_start_station (sorted busiest
# first).
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable
# variables; slice_head() returns at most 30 rows.
# NOTE(review): crs = 4269 is kept from the original; confirm that it matches
# the datum of the source coordinates.
pop_start_station %>%
  ungroup() %>%
  slice_head(n = 30) %>%
  mapview(
    xcol = "start_lng",
    ycol = "start_lat",
    cex = "nr_rides_start", # circle size follows the ride count
    alpha = 0.9,
    crs = 4269,
    color = "#8b0000",
    grid = FALSE,
    legend = TRUE,
    layer.name = "30 Most Popular Start Stations"
  )

Mapview of 30 most popular end station

# Interactive map of the first 30 rows of pop_end_station (sorted busiest
# first).
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable
# variables; slice_head() returns at most 30 rows.
# NOTE(review): crs = 4269 is kept from the original; confirm that it matches
# the datum of the source coordinates.
pop_end_station %>%
  ungroup() %>%
  slice_head(n = 30) %>%
  mapview(
    xcol = "end_lng",
    ycol = "end_lat",
    cex = "nr_rides_end", # circle size follows the ride count
    alpha = 0.9,
    crs = 4269,
    color = "#8b0000",
    grid = FALSE,
    legend = TRUE,
    layer.name = "30 Most Popular End Stations"
  )

Google Data Analytics Capstone Project

Christian Soenggoro

2022-07-24

Load the necessary libraries

Load the data

Check for NA values

Remove empty rows and columns and automatically clean column names

Change the latitude and longitude format from character to numeric type

Change format to date-time

Make new columns based on date-time information

Change to factor data type and rename columns for easier understanding

Check for duplicate data

Filter out illogical trip times (keep trips lasting between 1 minute and 24 hours)

Select columns that contain date-time information

Select columns that contain location information

Data Visualization

Set plot theme

Hourly Data Visualization

Hourly number of rides

Hourly Average trip time

Hourly Total trip time

Day of the week visualization

Day of the week number of rides

Day of the week Average trip time

Day of the week Total trip time

Day of the month Visualization

Day of the month number of rides

Day of the month Average trip time

Day of the month Total trip time

Visualize by location

Top 10 Start station

Top 10 end station

Mapview of 30 most popular start station

Mapview of 30 most popular end station