Take Home Exercise 1

Data preparation

load necessary libraries

pacman::p_load(ggrepel,ggthemes,hrbrthemes,rstatix,dplyr, gt,visNetwork, patchwork, tidyverse,lubridate,kableExtra,ggridges,plotly,gganimate,viridis,ggplot2movies,readxl, gifski, gapminder)

Read the CSV files

Read the CSV file “FinancialJournal.csv” and store the data in the variable “financial”. Suppress display of column types. Remove duplicate rows using unique() Apply abs() to amount column to remove negative sign for spending.

financial<- read_csv("data/FinancialJournal.csv",show_col_types = FALSE)
financial <- unique(financial)
#glimpse(financial,width=NULL)
financial$amount <- abs(financial$amount )

financial <- financial %>% 
  mutate(
    year = year(timestamp),
    month = month(timestamp),
    weekdays = weekdays(timestamp),
    day = day(timestamp)
  )

Read the CSV file “Participants.csv” and store the data in the variable “participants”. Suppress display of column types. Remove duplicate rows using unique()

participants <- read_csv("data/Participants.csv",show_col_types = FALSE)
participants <- unique(participants)
#glimpse(participants,width=NULL)

Perform a left join on two tables

Perform a left join between the “financial” and “participants” data frames, matching rows by participantId. Preview of the “joined_data” as follow.

joined_data <- left_join(financial, participants, by = join_by(participantId   == participantId  ))
#glimpse(joined_data,width=NULL)

Check for missing data

Apply the function to calculate the sum of missing values (NA) for each column specified in the joined_data data frame. Number of missing value = 0 in all 10 columns. No missing value found.

Show the code

# Check for missing values using is.na()
missing_data <- is.na(joined_data)

# Check for missing values using is.na()
missing_data <- sapply(joined_data, function(x) sum(is.na(x)))

# Print the number of missing values per column
print(missing_data)

 participantId      timestamp       category         amount           year 
             0              0              0              0              0 
         month       weekdays            day  householdSize       haveKids 
             0              0              0              0              0 
           age educationLevel  interestGroup      joviality 
             0              0              0              0

Removing outliers

Removed Participants with with not enough data points in FinancialJournal. Filtered out these participantId from all the original tables. Plot a histogram of the outlier participantId.

Show the code

# Calculate count data
count_data <- joined_data %>%
  count(participantId, name = "count")

# Calculate quartiles and IQR from count_data
q1 <- quantile(count_data$count, 0.25)
q3 <- quantile(count_data$count, 0.75)
iqr <- q3 - q1

# Define outlier cutoff values
lower_cutoff <- q1 - 1.5 * iqr
upper_cutoff <- q3 + 1.5 * iqr

# Filter out outliers from joined_data
filtered_data <- joined_data %>%
  inner_join(count_data, by = "participantId") %>%
  filter(count >= lower_cutoff & count <= upper_cutoff)

# Filter out outliers from participants
filtered_participants <- participants %>%
  inner_join(count_data, by = "participantId") %>%
  filter(count >= lower_cutoff & count <= upper_cutoff)

# Filter out outliers from financial
filtered_financial <- financial %>%
  inner_join(count_data, by = "participantId") %>%
  filter(count >= lower_cutoff & count <= upper_cutoff)

# histogram plot of participantId sorted by count
# added lines for upper and lower Cutoff
ggplot(count_data, aes(x = reorder(participantId, count), y = count)) +
  geom_bar(stat = "identity", width = 0.4) +
  geom_hline(yintercept = lower_cutoff, linetype = "dotted", color = "red")+
  geom_hline(yintercept = upper_cutoff, linetype = "dotted", color = "red")+
  annotate("text", x = Inf, y = lower_cutoff, label = paste("Lower Cutoff:", lower_cutoff),
           vjust = -1, hjust = 1, color = "red") +
  annotate("text", x = Inf, y = upper_cutoff, label = paste("Upper Cutoff:", upper_cutoff),
           vjust = 1, hjust = 1, color = "red") +
  xlab("Participant ID") +
  ylab("Count") +
  ggtitle("Sorted Histogram on Count of records by Participant IDs ")

Transform using pviot_wider

transform financial data using pviot_wider by “category” Replace NA wil ‘0’ and added year, month, weekdays and date fields.

Show the code

transformed_financial<- filtered_financial %>% 
  pivot_wider(names_from = "category", 
              values_from = "amount",
              values_fn = list(amount = sum))

transformed_financial$count <- NULL
transformed_financial$Wage <- replace(transformed_financial$Wage, is.na(transformed_financial$Wage), 0)
transformed_financial$Shelter <- replace(transformed_financial$Shelter, is.na(transformed_financial$Shelter), 0)
transformed_financial$Education <- replace(transformed_financial$Education, is.na(transformed_financial$Education), 0)
transformed_financial$RentAdjustment <- replace(transformed_financial$RentAdjustment, is.na(transformed_financial$RentAdjustment), 0)
transformed_financial$Food <- replace(transformed_financial$Food, is.na(transformed_financial$Food), 0)
transformed_financial$Recreation <- replace(transformed_financial$Recreation, is.na(transformed_financial$Recreation), 0)

#transformed_financial <- transformed_financial %>% 
#  mutate(
#    year = year(timestamp),
#    month = month(timestamp),
#    weekdays = weekdays(timestamp),
#    date = date(timestamp)
#  )

head(transformed_financial, 5) %>%
  kbl(caption = "Sample table after transformed") %>%
  kable_classic_2(full_width = F)

Sample table after transformed
participantId	timestamp	year	month	weekdays	day	Wage	Shelter	Education
0	2022-03-01	2022	3	Tuesday	1	2472.508	554.9886	38.00538
1	2022-03-01	2022	3	Tuesday	1	2046.562	554.9886	38.00538
2	2022-03-01	2022	3	Tuesday	1	2436.915	556.5529	12.81245
3	2022-03-01	2022	3	Tuesday	1	2366.630	554.9886	38.00538
4	2022-03-01	2022	3	Tuesday	1	2456.687	1556.3562	12.81245

Visulization on the dataset

Distribution of participants data

Show the code

ggplot(data = filtered_participants) +
  geom_bar(mapping = aes(x = householdSize),color="grey25", fill="grey90")+
  theme_gray() +
  ggtitle("Distribution of householdSize")+
  theme_economist()

Show the code

ggplot(data = filtered_participants) +
  geom_bar(mapping = aes(x = haveKids),color="grey25", fill="grey90")+
  theme_gray() +
  ggtitle("Distribution of haveKids")+
  theme_economist()

Show the code

ggplot(data = filtered_participants) +
  geom_bar(mapping = aes(x = educationLevel),color="grey25", fill="grey90")+
  theme_gray() +
  ggtitle("Distribution of educationLevel")+
  theme_economist()

Show the code

ggplot(data = filtered_participants) +
  geom_bar(mapping = aes(x = interestGroup),color="grey25", fill="grey90")+
  theme_gray() +
  ggtitle("Distribution of interestGroup")+
  theme_economist()

Distribution of financial data

Overview distribution of a continuous variable across multiple categories.

Show the code

ggplot(filtered_financial, aes(x = amount,color = category, y = category)) + 
  ggridges::geom_density_ridges() +
  scale_x_log10(labels = scales::dollar)

Interactive distribution summary in ggplotly.

Show the code

# Create the ggplot object
p <- ggplot(filtered_financial, aes(x = amount, color = category, fill = category)) +
  geom_density(alpha = .15) +
  scale_x_log10(labels = scales::dollar)

# Convert ggplot to plotly
p <- ggplotly(p)

# Display the interactive plot
p

Wage and Spending across 12 month

Monthly wage and spending consistent across all 12 month except RentAdjustment.

Show the code

ggplot(data = filtered_financial, aes(x = month, y = category, fill = after_stat(x))) +
  geom_density_ridges_gradient(scale = 3, rel_min_height = 0.01) +
  theme_minimal() +
  theme(legend.position="none",
  axis.text = element_text(size = 8)) +
  scale_fill_viridis() +
  transition_time(filtered_financial$month) +
  ease_aes('linear')

Spending for Recreation is usually higher on weekends. Spending for Shelter is usually on 1st or 2nd of each month.

Show the code

p1 <- ggplot(transformed_financial, aes(x = weekdays, y = Recreation)) +
  geom_bar(stat = "identity", width = 0.5) +
  xlab("Weekdays") +
  ylab("Total Spending on Recreation") +
  ggtitle("Spending on Recreation by Weekday ")+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

p2 <- ggplot(transformed_financial, aes(x = day, y = Shelter)) +
  geom_bar(stat = "identity", width = 1) +
  xlab("Day") +
  ylab("Total Spending on Shelter") +
  ggtitle("Spending on Shelter by Day ")

p1+p2

InterestGroup vs categorical variables

HighSchoolOrCOllege are majority in all interest group.

Show the code

filtered_participants %>% 
  count(interestGroup,educationLevel) %>%  
  ggplot(mapping = aes(x = interestGroup, y = educationLevel)) +
    geom_tile(mapping = aes(fill = n))

Most don’t have kids.

Show the code

filtered_participants %>% 
  count(interestGroup, haveKids) %>%  
  ggplot(mapping = aes(x = interestGroup , y = haveKids )) +
    geom_tile(mapping = aes(fill = n))

And group D mostly have 3 people in their household

Show the code

filtered_participants %>% 
  count(interestGroup, householdSize) %>%  
  ggplot(mapping = aes(x = interestGroup , y = householdSize )) +
    geom_tile(mapping = aes(fill = n))

Take Home Exercise 1

Data preparation

load necessary libraries

Read the CSV files

Perform a left join on two tables

Check for missing data

Removing outliers

Transform using pviot_wider

Visulization on the dataset

Distribution of participants data

Distribution of financial data

Wage and Spending across 12 month

InterestGroup vs categorical variables

Reference:

Ref

Ref