Homework 6: Incomplete

Connecticut Real Estate Sales Data

Joseph Farrell
2022-03-30
Load Libraries

Read in Data

Read in data
connecticut <- read_csv("/Users/nelsonfarrell/Downloads/Real_Estate_Sales_2001-2019_GL.csv")
county_town <- read_csv(("/Users/nelsonfarrell/Documents/501 Stats/Connecticut County:Towns.csv"))
county_town <- county_town %>%
  select("subregion", "town")

Data Wrangling

Rename “Town” variable “town” to join datasets
connecticut <- rename(connecticut, "town" = "Town")
Join datasets
connecticut <- 
  left_join(connecticut, county_town, by = "town")
Remove columns “Serial number”, “Non Use Code”, “Assessor Remarks”, “Sales Ratio”, “Property Type”, “Location”, and “OPM Remarks”
connecticut <- select(connecticut, 
                      "List Year", 
                      "Date Recorded", 
                      "town", 
                      "Assessed Value", 
                      "Sale Amount",
                      "Residential Type",
                      "subregion")
Remove NAs from “Residential Type”
connecticut <- connecticut %>%
  na.omit(`Residential Type`)
connecticut %>%
  count(`Residential Type`) %>%
  rename("Number of Properties Sold" = "n") %>%
  formattable(align = c("l","r"), list(`Number of Properties Sold` = color_bar("lightblue")))
Residential Type Number of Properties Sold
Condo 105420
Four Family 2150
Single Family 401611
Three Family 12586
Two Family 26408
Rename columns without spaces and for clarity
connecticut <- rename(connecticut, 
                      "list_year" = "List Year", 
                      "sale_date" = "Date Recorded", 
                      "assessed_price" = "Assessed Value", 
                      "sale_price" = "Sale Amount",
                      "residential_type" = "Residential Type", 
                      "county" = "subregion")
Convert “sale_date” from character to date and create new column “sale_year”
connecticut$sale_date <- as_date(connecticut$sale_date, 
                                 format = "%m/%d/%Y")
connecticut <- connecticut %>%
  mutate(sale_year = year(sale_date))
Remove all observations prior to 2007
connecticut <- connecticut %>%
  filter(sale_year > 2007)
Remove top and bottom 2.5% of “sale_price”
connecticut <- connecticut %>%
  filter(sale_price < quantile(connecticut$sale_price, .975) & 
           sale_price > quantile(connecticut$sale_price, .025)) 
Create a line graph overall mean sale price of residential properties in Connecticut
connecticut %>%
  group_by(year = lubridate::floor_date(sale_date, "year")) %>%
  summarize(mean_year = mean(sale_price)) %>%
  ggplot(aes(x = year, y = mean_year)) +
  geom_line(aes()) +
  geom_smooth() +
  ggtitle("Overall Mean Sale Price of Connecticut Properties: 2008-2020") +
  ylab("Sale Price") +
  xlab("Year") +
  theme_linedraw()

Create scatterplot of “mean_monthly_sale_price” grouped by “residential_type”
options(scipen = 999) # remove scientific notation
connecticut %>%
  group_by(residential_type, month = lubridate::floor_date(sale_date, "month")) %>%
  summarize(mean_monthy_sale_price = mean(sale_price)) %>%
  ggplot(aes(x = month, y = mean_monthy_sale_price)) +
  geom_point((aes(color = residential_type))) +
  geom_smooth() +
  ggtitle("Mean Monthly Sale Price of Residential Types") +
  ylab("Mean Sale Price ($)") +
  xlab("Year") +
  theme_linedraw() +
  scale_color_brewer(palette = "Dark2")

Scatterplot of yearly mean sale price for each “residential_type” with confidence intervals and regression line
options(scipen = 999) # remove scientific notation
connecticut %>%
  group_by(residential_type, year = lubridate::floor_date(sale_date, "year")) %>%
  summarize(mean_year_type = mean(sale_price)) %>%
  ggplot(aes(x = year, y = mean_year_type, group = residential_type)) +
  geom_point((aes(color = residential_type))) +
  geom_smooth() +
  ggtitle("Yearly Mean Sale Price: Residential Types") +
  ylab("Mean Sale Price ($)") +
  xlab("Year") +
  theme_linedraw() +
  facet_wrap(vars(residential_type))

Map Visualization

Read in map data
con_map <- map_data("county", 
         "connecticut")
Rename “subregion” “county” to join “con_map” with “connecticut”
con_map <- rename(con_map,
                  "county" = "subregion")
Create object map_vis: grouping by county and creating a “county_mean” column
map_vis <- connecticut %>%
  group_by(county) %>%
  summarize(county_mean = mean(sale_price))
Join mapping object with con_map data
map_vis <- inner_join(map_vis, con_map, by = "county")
Create object to use when labeling the counties on the map
county_names <- aggregate(cbind(long, lat) ~ county, data=map_vis, 
                    FUN=function(x)mean(range(x)))
Create map of Connecticut counties displaying the mean sale price of residential properties
ggplot(map_vis, aes(x = long, y = lat)) +
  geom_polygon(aes(fill = county_mean, group = county)) +
  geom_text(data = county_names, aes(long, lat, label = county), size =3) +
  scale_fill_gradient(low = "bisque4", high = "bisque2", name = "Average Sale Price (USD)") +
  ggtitle("Overall Average Sale Price in Connecticut Counties") +
  theme_light()

Change in mean “sale_price” of “Single Family” properties

Create of object of mean sale price of Single Family properties for each county in 2008
msf_2008 <- connecticut %>%
  group_by(county, 
           residential_type, 
           sale_year) %>%
  select(county,
         residential_type,
         sale_year, 
         sale_price) %>%
  summarize(mean_3 = mean(sale_price)) %>%
  filter(sale_year == "2008" &
         residential_type == "Single Family")
Create of object of mean sale price of Single Family properties for each town in 2020
msf_2020 <- connecticut %>%
  group_by(county, 
           residential_type, 
           sale_year) %>%
  select(sale_year,
         residential_type,
         sale_price,
         county) %>%
  summarize(mean_3 = mean(sale_price)) %>%
  filter(sale_year == "2020" &
         residential_type == "Single Family")
Join mean_2008 and mean_2020 and create new column delta_mean and c_mean
msf_08_20 <- msf_2008 %>%
  left_join(msf_2020, by = "county") %>%
  mutate(delta_mean = mean_3.y - mean_3.x) %>%
  mutate(c_mean = (mean_3.y - mean_3.x)/mean_3.x) %>%
  mutate(c_mean = c_mean * 100) %>%
  mutate(across(contains("mean"), round, 2)) %>%
  rename("mean_08" = "mean_3.x", 
         "mean_20" = "mean_3.y") %>%
  select("county",
         "mean_08",
         "mean_20",
         "delta_mean",
         "c_mean")
Create a table of the change in mean sale price of “Single Family” properties in the eight Connecticut counties
msf_08_20 %>%
  rename("County" = "county",
         "Mean 2008" = "mean_08",
         "Mean 2020" = "mean_20",
         "Change in Mean ($)" = "delta_mean",
         "Change in Mean (%)" = "c_mean") %>%
  formattable(list(`Mean 2020` = FALSE,
                   `Mean 2008` = FALSE,
                   `Change in Mean (%)` = formatter("span", 
                 style = ~ style(color = ifelse(`Change in Mean (%)` < 0, 
                                                "firebrick", 
                                                "forestgreen")),
                 ~ icontext(sapply(`Change in Mean (%)`, 
                                   function(x) if (x < 0) "arrow-down" else if (x > 0) "arrow-up"), 
                            `Change in Mean (%)`))))
County Change in Mean ($) Change in Mean (%)
fairfield 63048.12 12.56
hartford 13893.09 5.21
litchfield 60529.46 20.61
middlesex -13165.68 -3.97
new haven 110.93 0.04
new london 1967.19 0.72
tolland -6082.70 -2.29
windham 24476.58 11.89
Create Bargraph of the change in mean sale price from 2008 to 2020
msf_08_20 %>% 
  mutate(Color = ifelse(c_mean < 0, "rosybrown","steelblue")) %>%
  ggplot(aes(x = reorder(county, +c_mean), 
             y = c_mean, 
             fill = Color)) +
  geom_col()+
  geom_text(aes(label = c_mean, 
                vjust = 1.2),
                size = 3) +
  labs(title = "Percentage Change in Mean Sale Price for Single Family Properties: 2008 to 2020",
                x = "County Name",
                y = "Percent Change") +
  theme(axis.text.x = element_text(angle = 45, 
                                   hjust = 1)) +
  scale_fill_identity(guide = "none")

Create line graph of mean sale price of single family properties in each county from 2008 to 2020
connecticut %>%
  group_by(county, residential_type, year = lubridate::floor_date(sale_date, "year")) %>%
  summarize(county_mean = mean(sale_price)) %>%
  filter(residential_type == "Single Family") %>%
  ggplot(aes(x = year, y = county_mean)) +
  geom_line(aes(color = county)) +
  ggtitle("Mean Sale Price of Single Family Properties: Connecticut Counties") +
  geom_smooth() +
  ylab("Sale Price") +
  xlab("Year") +
  theme_classic()

Create line graph of the mean sale price of single family properties in different counties
connecticut %>%
  group_by(county, 
           residential_type, 
           year = lubridate::floor_date(sale_date, "year")) %>%
  summarize(mean_year = mean(sale_price)) %>%
  filter(residential_type == "Single Family") %>%
  ggplot(aes(x = year, 
             y = mean_year)) +
  geom_line(aes(color = county)) +
  ggtitle("Mean Sale Price of Single Family Properties: Connecticut Counties") +
  ylab("Sale Price") +
  xlab("Year") +
  theme_bw() +
  facet_wrap(vars(county)) +
  guides(color = "none")

Diving deeper into Litchfield County Sale Prices

Create graphic of mean yearly sale price for just Litchfield County, group by property type
connecticut %>%
  filter(county == "litchfield") %>%
  group_by(residential_type, 
           year = lubridate::floor_date(sale_date, "year")) %>%
  summarize(mean_litchfield = mean(sale_price)) %>%
  ggplot(aes(x = year, 
             y = mean_litchfield)) +
  geom_line(aes(color = residential_type)) +
  geom_smooth() +
  theme_bw() +
  labs(title = "Mean Sale Price of Different Residential Types",
       subtitle = "Litchfield County 2008-2020", 
       x = "Year", 
       y = "Mean Sale Price ($)") +
  facet_wrap(vars(residential_type)) +
  guides(color = "none")

Examine counts of residential type in Litchfield County
Litchfield <- connecticut %>%
  group_by(sale_year) %>%
  filter(county == "litchfield")
Litchfield %>%
  group_by(residential_type) %>%
  count(residential_type) %>%
  rename("Residential Type" = "residential_type", 
         "Count: Properties Sold" = "n") %>%
  knitr::kable(caption = "Counts of Residential Types: Litchfield County") %>%
  kable_classic(full_width = T) %>%
  row_spec(2, background = "yellow") %>%
  row_spec(4, background = "yellow") 
Table 1: Counts of Residential Types: Litchfield County
Residential Type Count: Properties Sold
Condo 3611
Four Family 61
Single Family 22891
Three Family 327
Two Family 1177
Display counts of different residential types over the 12 years 2008-2020
Litchfield %>%
  group_by(sale_year, residential_type) %>%
  ggplot(aes(x = sale_year, fill = residential_type)) +
  geom_bar(aes(stat="identity")) + 
  facet_wrap(vars(residential_type)) +
  labs(title = "Number of Properties Sold Each Year in Litchfield: 2008-2020",
       subtitle = "Displayed by Property Type",
       x = "Year 2008-2020",
       y = "Numnber of Properties") +
  theme(legend.position = "none")

Litchfield %>%
  filter(residential_type == "Four Family") %>%
  group_by(sale_year) %>%
  count(residential_type) %>%
  rename("Year" = "sale_year",
         "Residential Type" = "residential_type",
         "Total Sold" = "n") %>%
  knitr::kable(caption = "Counts of Four Family Properties Sold Each Year") %>%
  kable_classic(full_width = F, position = "left") %>%
  row_spec(4, background = "yellow") 
Table 2: Counts of Four Family Properties Sold Each Year
Year Residential Type Total Sold
2008 Four Family 3
2009 Four Family 2
2010 Four Family 4
2011 Four Family 1
2012 Four Family 3
2013 Four Family 3
2014 Four Family 5
2015 Four Family 3
2016 Four Family 5
2017 Four Family 4
2018 Four Family 10
2019 Four Family 10
2020 Four Family 8
Filter Litchfield County Object for only “Single Family” properties
Litchfield_SF <- Litchfield %>%
  filter(residential_type == "Single Family")
Display the mean sale price of “Single Family” properties in Litchfeild County towns: 2008-2020
Litchfield %>%
  group_by(town, year = lubridate::floor_date(sale_date, "year")) %>%
  summarize(meanL = mean(sale_price)) %>%
  ggplot(aes(x = year, y = meanL)) +
  geom_line(aes(color = town)) +
  facet_wrap(vars(town)) +
  guides(color = "none")

L08 <- Litchfield_SF %>%
  group_by(town, sale_year) %>%
  summarise(mean_L08 = mean(sale_price)) %>%
  filter(sale_year == "2008")

L20 <- Litchfield_SF %>%
  group_by(town, sale_year) %>%
  summarise(mean_L20 = mean(sale_price)) %>%
  filter(sale_year == "2020")

L_combined <- L08 %>%
  left_join(L20, by = "town")

L_combined <- L_combined %>%
  mutate(L_delta_mean = mean_L20 - mean_L08) %>%
  mutate(L_c_mean = (mean_L20 - mean_L08)/mean_L08) %>%
  mutate(L_c_mean = L_c_mean * 100) %>%
  mutate(across(contains("mean"), round, 2))
L_combined %>% 
  mutate(Color = ifelse(L_c_mean < 0, "rosybrown","steelblue")) %>%
  ggplot(aes(x = reorder(town, +L_c_mean), 
             y = L_c_mean, 
             fill = Color)) +
  geom_col()+
  geom_text(aes(label = L_c_mean, 
                vjust = 1.2),
                size = 3) +
  labs(title = "Percentage Change in Mean Sale Price: 2008 to 2020",
                x = "County Name",
                y = "Percent Change") +
  theme(axis.text.x = element_text(angle = 45, 
                                            hjust = 1)) +
  scale_fill_identity(guide = "none")

Litchfield %>%
  count(town)
# A tibble: 337 × 3
# Groups:   sale_year [13]
   sale_year town            n
       <dbl> <chr>       <int>
 1      2008 Barkhamsted    27
 2      2008 Bethlehem      22
 3      2008 Bridgewater    12
 4      2008 Canaan          5
 5      2008 Colebrook      18
 6      2008 Cornwall       22
 7      2008 Goshen          9
 8      2008 Harwinton      32
 9      2008 Kent           33
10      2008 Litchfield     71
# … with 327 more rows

Time Series Analysis: Arima

Create a time series object using singe family properties and weekly mean
ts_object <- connecticut %>%
  group_by(residential_type, week = lubridate::floor_date(sale_date, "week")) %>%
  summarise(mean_L = mean(sale_price)) %>%
  filter(residential_type == "Single Family") 
Create time series, predict future price behavior
ts_object <- xts(ts_object$mean_L, ts_object$week)

plot(ts_object)
pred <- auto.arima(ts_object)

forcast1 <- forecast(pred, 10)

print(forcast1)
     Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
4663       371356.5 351916.6 390796.4 341625.8 401087.2
4670       367049.0 346515.0 387583.0 335645.0 398453.1
4677       361690.1 340159.0 383221.2 328761.2 394619.0
4684       356143.5 333425.6 378861.3 321399.5 390887.4
4691       350616.8 326593.5 374640.1 313876.3 387357.3
4698       345227.6 319872.5 370582.7 306450.4 384004.8
4705       340062.3 313421.0 366703.6 299317.9 380806.7
4712       335184.9 307350.6 363019.1 292616.1 377753.6
4719       330640.4 301734.3 359546.4 286432.4 374848.4
4726       326458.0 296613.8 356302.2 280815.3 372100.7
plot(forcast1)

Reuse

Text and figures are licensed under Creative Commons Attribution CC BY-NC 4.0. The figures that have been reused from other sources don't fall under this license and can be recognized by a note in their caption: "Figure from ...".

Citation

For attribution, please cite this work as

Farrell (2022, April 3). Data Analytics and Computational Social Science: Homework 6: Incomplete. Retrieved from https://github.com/DACSS/dacss_course_website/posts/httpsrpubscomjnfarrell211884388/

BibTeX citation

@misc{farrell2022homework,
  author = {Farrell, Joseph},
  title = {Data Analytics and Computational Social Science: Homework 6: Incomplete},
  url = {https://github.com/DACSS/dacss_course_website/posts/httpsrpubscomjnfarrell211884388/},
  year = {2022}
}