Data Analytics and Computational Social Science: Challenge

Joseph Farrell

Load Libraries

hide

library(readr)
library(tidyverse)
library(dplyr)
library(ggplot2)
library(lubridate)
library(plotly)
library(gapminder)
library(gganimate)
library(gifski)
library(png)

Read in and clean data:

This is just data from final project that I cut and pasted in

hide

connecticut <- read_csv("/Users/nelsonfarrell/Downloads/Real_Estate_Sales_2001-2019_GL.csv")
county_town <- read_csv(("/Users/nelsonfarrell/Documents/501 Stats/Connecticut County:Towns.csv"))
county_town <- county_town %>%
  select("subregion", "town")

connecticut <- rename(connecticut, "town" = "Town")

connecticut <- 
  left_join(connecticut, county_town, by = "town")

connecticut <- select(connecticut, 
                      "List Year", 
                      "Date Recorded", 
                      "town", 
                      "Assessed Value", 
                      "Sale Amount",
                      "Residential Type",
                      "subregion")

connecticut <- connecticut %>%
  na.omit(`Residential Type`)

connecticut <- rename(connecticut, 
                      "list_year" = "List Year", 
                      "sale_date" = "Date Recorded", 
                      "assessed_price" = "Assessed Value", 
                      "sale_price" = "Sale Amount",
                      "property_type" = "Residential Type", 
                      "county" = "subregion")

connecticut$sale_date <- as_date(connecticut$sale_date, 
                                 format = "%m/%d/%Y")
connecticut <- connecticut %>%
  mutate(sale_year = year(sale_date))

connecticut <- connecticut %>%
  filter(sale_year > 2007)

connecticut <- connecticut %>%
  filter(sale_price < quantile(connecticut$sale_price, .975) & 
           sale_price > quantile(connecticut$sale_price, .025))

Gganimate: Geom_line:

hide

meh <-connecticut %>%
  group_by(year = lubridate::floor_date(sale_date, "year"), 
           property_type) %>%
  summarize(mean = mean(sale_price))

meh5 <-ggplot(meh, 
              aes(year, 
                  mean, 
                  color = property_type)) +
  geom_line() +
  transition_reveal(year) +
  labs(title = "Mean Sale Price",
                x = "Year",
                y = "Price ($)") +
  guides(color = guide_legend(title = "Property Type"))

 
animate(meh5)

Gganimate: Geom_point:

hide

meh2 <-ggplot(meh, 
              aes(year, 
                  mean, 
                  color = property_type)) +
  geom_point() +
  transition_time(year) +
  labs(title = "Mean Sale Price",
                x = "Year",
                y = "Price ($)") +
  guides(color = guide_legend(title = "Property Type"))

 
animate(meh2)

Plotly

hide

plotly <- connecticut %>%
  group_by(property_type, 
           month = lubridate::floor_date(sale_date, "month")) %>%
  summarize(mean_monthy_sale_price = mean(sale_price)) %>%
  ggplot(aes(x = month, 
             y = mean_monthy_sale_price)) +
  geom_point((aes(color = property_type))) +
  geom_smooth() +
  ggtitle("Mean Monthly Sale Price of Property Types") +
  ylab("Mean Sale Price ($)") +
  xlab("Year") +
  theme_linedraw() +
  guides(color = guide_legend(title = "Property Type"))

ggplotly(plotly)

Functions

This function finds the mean sale price of the groups entered

It’s only kinda useful, it will find the mean sale of price of counties and towns depending on what’s entered

hide

mean_sale_price <- function(df, ...) {
  df %>%
    group_by(...) %>%
    dplyr::summarize(mean = mean(sale_price))
}

connecticut %>%
  mean_sale_price(town, sale_year)

# A tibble: 2,184 × 3
# Groups:   town [169]
   town    sale_year    mean
   <chr>       <dbl>   <dbl>
 1 Andover      2008 258944.
 2 Andover      2009 240323.
 3 Andover      2010 227498.
 4 Andover      2011 241943.
 5 Andover      2012 235318.
 6 Andover      2013 233322.
 7 Andover      2014 219865 
 8 Andover      2015 210757.
 9 Andover      2016 249672.
10 Andover      2017 226692.
# … with 2,174 more rows

Function to find the delta mean in ($) and in (%) of the entire column “town”

This function finds the change in mean sale price in ($) and in (%).
It’s not really useful because it is essentially just the code I wrote turned into a function.

hide

delta_mean_sale_price <- function(df, ...) {
  a <- df %>%
    group_by(...) %>%
    dplyr::summarize(mean_a = mean(sale_price)) %>%
    filter(sale_year == "2008")
  
  b <- df %>%
    group_by(...) %>%
    dplyr::summarize(mean_b = mean(sale_price)) %>%
    filter(sale_year == "2020")
 
  a %>%
  left_join(b, by = "town") %>%
  mutate(delta_mean_town = mean_b - mean_a) %>%
  mutate(delta_mean_percent = (delta_mean_town/mean_a)*100) %>%
  mutate(across(contains("mean"), round, 2)) %>%
  select("town",
         "delta_mean_town",
         "delta_mean_percent")
}

connecticut %>%
  delta_mean_sale_price(town, sale_year)

# A tibble: 169 × 3
# Groups:   town [169]
   town         delta_mean_town delta_mean_percent
   <chr>                  <dbl>              <dbl>
 1 Andover               -3282.              -1.27
 2 Ansonia               -5165.              -2.39
 3 Ashford                -548.              -0.27
 4 Avon                 -82496.             -18.6 
 5 Barkhamsted           -5822.              -2.06
 6 Beacon Falls         -41526.             -12.2 
 7 Berlin                 7320.               2.56
 8 Bethany              -78237.             -17.9 
 9 Bethel                58070.              19.2 
10 Bethlehem            -45199.             -12.7 
# … with 159 more rows

Function to find the delta mean in ($) of specific town or county

This is probably my most useful function and it’s still not really useful. This function returns the change in mean sale price of the town or county entered.
My goal was to write a function that only required the input of the “town” or “county” name, but this was the best I could do.
Filter() was challenging, rename() was challenging, etc…lots of challenges with these functions!

hide

test <- function(df, col_name, value) {
  a <- df %>%
    filter({{col_name}} == {{value}} & sale_year == "2008") %>%
    summarise(town_mean_08 = mean(sale_price)) 
  b <- df %>%
    filter({{col_name}} == {{value}} & sale_year == "2020") %>%
    summarise(delta_mean = mean(sale_price)) 
  b - a 
}

 test(connecticut, county, "fairfield")

  delta_mean
1   49997.28

Function to return the towns with the five highest change in mean sale price

This function is again not really useful. It only does one thing. I couldn’t figure out how to get left_join() or select() to use my “…” input.

hide

max_mean <- function(df, ...) {
  a <- df %>%
    group_by(...) %>%
    dplyr::summarize(mean_a = mean(sale_price)) %>%
    filter(sale_year == "2008")
  
  b <- df %>%
    group_by(...) %>%
    dplyr::summarize(mean_b = mean(sale_price)) %>%
    filter(sale_year == "2020")
 
  a %>%
  left_join(b, by = "town") %>%
  mutate(delta_mean_town = mean_b - mean_a) %>%
  mutate(percent_change_mean = (delta_mean_town/mean_a)*100) %>%
  mutate(across(contains("mean"), 
                         round, 
                         2)) %>%
  select("town",
         "delta_mean_town",
         "percent_change_mean") %>%
   arrange(desc(delta_mean_town)) %>%
   head(n = 5)
}

connecticut %>%
  max_mean(town, sale_year)

# A tibble: 5 × 3
# Groups:   town [5]
  town       delta_mean_town percent_change_mean
  <chr>                <dbl>               <dbl>
1 Goshen             256402.               117. 
2 Cornwall           188251.                54.0
3 Salisbury          157199.                32.5
4 Washington         155603.                32.3
5 Warren             123385                 31.4

Comment on this article Share:

Challenge

Load Libraries

Read in and clean data:

Gganimate: Geom_line:

Gganimate: Geom_point:

Plotly

Functions

Reuse

Citation