Homework 5: Complete

Connecticut Real Estate Sales Data

Joseph Farrell
2022-03-09
Load Libraries
Read in data
connecticut <- read_csv("/Users/nelsonfarrell/Downloads/Real_Estate_Sales_2001-2019_GL.csv")
Check variables of connecticut
str(connecticut)
spec_tbl_df [930,621 × 14] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
 $ Serial Number   : num [1:930621] 141466 140604 14340 140455 141195 ...
 $ List Year       : num [1:930621] 2014 2014 2014 2014 2014 ...
 $ Date Recorded   : chr [1:930621] "08/06/2015" "06/29/2015" "07/01/2015" "04/30/2015" ...
 $ Town            : chr [1:930621] "Stamford" "New Haven" "Ridgefield" "New Britain" ...
 $ Address         : chr [1:930621] "83 OVERBROOK DRIVE" "56 HIGHVIEW LANE" "32 OVERLOOK DR" "171 BRADFORD WALK" ...
 $ Assessed Value  : num [1:930621] 503270 86030 351880 204680 229330 ...
 $ Sale Amount     : num [1:930621] 850000 149900 570000 261000 250000 ...
 $ Sales Ratio     : num [1:930621] 0.592 0.574 0.617 0.784 0.917 ...
 $ Property Type   : chr [1:930621] "Residential" "Residential" "Residential" "Residential" ...
 $ Residential Type: chr [1:930621] "Single Family" "Single Family" "Single Family" "Condo" ...
 $ Non Use Code    : chr [1:930621] NA NA NA NA ...
 $ Assessor Remarks: chr [1:930621] NA NA NA NA ...
 $ OPM remarks     : chr [1:930621] NA NA NA NA ...
 $ Location        : chr [1:930621] "POINT (-73.525969 41.081897)" "POINT (-72.878115 41.30285)" "POINT (-73.508273 41.286223)" "POINT (-72.775592 41.713335)" ...
 - attr(*, "spec")=
  .. cols(
  ..   `Serial Number` = col_double(),
  ..   `List Year` = col_double(),
  ..   `Date Recorded` = col_character(),
  ..   Town = col_character(),
  ..   Address = col_character(),
  ..   `Assessed Value` = col_double(),
  ..   `Sale Amount` = col_double(),
  ..   `Sales Ratio` = col_double(),
  ..   `Property Type` = col_character(),
  ..   `Residential Type` = col_character(),
  ..   `Non Use Code` = col_character(),
  ..   `Assessor Remarks` = col_character(),
  ..   `OPM remarks` = col_character(),
  ..   Location = col_character()
  .. )
 - attr(*, "problems")=<externalptr> 
Remove columns Serial number, Non Use Code, Assessor Remarks, Sales Ratio, and OPM Remarks
connecticut <- select(connecticut, 
                      "List Year", 
                      "Date Recorded", 
                      "Town", 
                      "Assessed Value", 
                      "Sale Amount", 
                      "Property Type", 
                      "Residential Type", 
                      "Location")
head(connecticut) #view first 6 rows of `connecticut`
# A tibble: 6 × 8
  `List Year` `Date Recorded` Town      `Assessed Value` `Sale Amount`
        <dbl> <chr>           <chr>                <dbl>         <dbl>
1        2014 08/06/2015      Stamford            503270        850000
2        2014 06/29/2015      New Haven            86030        149900
3        2014 07/01/2015      Ridgefie…           351880        570000
4        2014 04/30/2015      New Brit…           204680        261000
5        2014 06/26/2015      Stamford            229330        250000
6        2014 06/29/2015      West Har…           356300        600000
# … with 3 more variables: `Property Type` <chr>,
#   `Residential Type` <chr>, Location <chr>
View counts of Property Type and Residential Type
connecticut %>%
  count(`Property Type`)
# A tibble: 18 × 2
   `Property Type`      n
   <chr>            <int>
 1 A                 2210
 2 Apartments        2922
 3 C                    3
 4 C/I/U            11913
 5 Commercial       19404
 6 Condo             9932
 7 Four Family        237
 8 Industrial        2551
 9 Public Utility      66
10 R               290287
11 Residential     494147
12 Single Family    40518
13 Three Family      1228
14 Two Family        2411
15 UA                  11
16 V                24479
17 Vacant Land      23414
18 <NA>              4888
connecticut %>%
  count(`Residential Type`)
# A tibble: 6 × 2
  `Residential Type`      n
  <chr>               <int>
1 Condo              105420
2 Four Family          2150
3 Single Family      401612
4 Three Family        12586
5 Two Family          26408
6 <NA>               382445
Remove Property Type Column
connecticut <- select(connecticut, 
                      "List Year", 
                      "Date Recorded", 
                      "Town", 
                      "Assessed Value", 
                      "Sale Amount",
                      "Residential Type", 
                      "Location")
Remove NAs from Residential Type
connecticut <- connecticut %>%
  na.omit(`Residential Type`)
connecticut %>%
  count(`Residential Type`)
# A tibble: 5 × 2
  `Residential Type`      n
  <chr>               <int>
1 Condo               76907
2 Four Family          1690
3 Single Family      303731
4 Three Family        10194
5 Two Family          21184
Rename columns without spaces and for clarity
connecticut <- rename(connecticut, 
                      "list_year" = "List Year", 
                      "sale_date" = "Date Recorded", 
                      "assessed_price" = "Assessed Value", 
                      "sale_price" = "Sale Amount",
                      "residential_type" = "Residential Type", 
                      "location" = "Location", 
                      "town" = "Town")

List and description of remaining variables

Potential Research Questions

Explore Data
head(connecticut)
# A tibble: 6 × 7
  list_year sale_date town  assessed_price sale_price residential_type
      <dbl> <chr>     <chr>          <dbl>      <dbl> <chr>           
1      2014 08/06/20… Stam…         503270     850000 Single Family   
2      2014 06/29/20… New …          86030     149900 Single Family   
3      2014 07/01/20… Ridg…         351880     570000 Single Family   
4      2014 04/30/20… New …         204680     261000 Condo           
5      2014 06/26/20… Stam…         229330     250000 Single Family   
6      2014 06/29/20… West…         356300     600000 Single Family   
# … with 1 more variable: location <chr>
tail(connecticut)
# A tibble: 6 × 7
  list_year sale_date town  assessed_price sale_price residential_type
      <dbl> <chr>     <chr>          <dbl>      <dbl> <chr>           
1      2014 03/31/20… Litc…         220250     310000 Single Family   
2      2014 02/19/20… Hamd…         220080     402500 Single Family   
3      2014 12/18/20… Danb…         248500     428000 Single Family   
4      2014 11/20/20… Naug…         175910     252500 Single Family   
5      2014 06/18/20… Madi…         409200     580000 Single Family   
6      2014 04/22/20… Ches…         379100     520000 Condo           
# … with 1 more variable: location <chr>
Convert sale_date from character to date and create new column year
connecticut$sale_date <- as_date(connecticut$sale_date, 
                                 format = "%m/%d/%Y")
connecticut <- connecticut %>%
  mutate(year = floor_date(sale_date, 
                      unit = "year"))
str(connecticut)
tibble [413,706 × 8] (S3: tbl_df/tbl/data.frame)
 $ list_year       : num [1:413706] 2014 2014 2014 2014 2014 ...
 $ sale_date       : Date[1:413706], format: "2015-08-06" ...
 $ town            : chr [1:413706] "Stamford" "New Haven" "Ridgefield" "New Britain" ...
 $ assessed_price  : num [1:413706] 503270 86030 351880 204680 229330 ...
 $ sale_price      : num [1:413706] 850000 149900 570000 261000 250000 ...
 $ residential_type: chr [1:413706] "Single Family" "Single Family" "Single Family" "Condo" ...
 $ location        : chr [1:413706] "POINT (-73.525969 41.081897)" "POINT (-72.878115 41.30285)" "POINT (-73.508273 41.286223)" "POINT (-72.775592 41.713335)" ...
 $ year            : Date[1:413706], format: "2015-01-01" ...
 - attr(*, "na.action")= 'omit' Named int [1:516915] 9 13 17 18 19 25 27 28 33 42 ...
  ..- attr(*, "names")= chr [1:516915] "9" "13" "17" "18" ...
Examine first and last 75 rows of connecticut
head(connecticut, n = 75)
# A tibble: 75 × 8
   list_year sale_date  town          assessed_price sale_price
       <dbl> <date>     <chr>                  <dbl>      <dbl>
 1      2014 2015-08-06 Stamford              503270     850000
 2      2014 2015-06-29 New Haven              86030     149900
 3      2014 2015-07-01 Ridgefield            351880     570000
 4      2014 2015-04-30 New Britain           204680     261000
 5      2014 2015-06-26 Stamford              229330     250000
 6      2014 2015-06-29 West Hartford         356300     600000
 7      2014 2014-10-17 Willington            236060     332900
 8      2014 2015-08-05 Trumbull              183900     240000
 9      2014 2014-12-02 Waterbury              75840      91000
10      2014 2015-07-01 Trumbull              178500     175000
# … with 65 more rows, and 3 more variables: residential_type <chr>,
#   location <chr>, year <date>
tail(connecticut, n = 75)
# A tibble: 75 × 8
   list_year sale_date  town          assessed_price sale_price
       <dbl> <date>     <chr>                  <dbl>      <dbl>
 1      2014 2015-06-02 Danbury               205900     382000
 2      2014 2014-12-23 Easton                314300     536000
 3      2014 2015-06-12 Colchester            210600     301800
 4      2014 2015-06-04 Madison               451000     549000
 5      2014 2014-10-15 Enfield               159720     121000
 6      2014 2015-07-21 Danbury               203300     305000
 7      2014 2015-07-07 Darien                677530     850000
 8      2014 2015-04-29 East Hartford         103980     170000
 9      2014 2015-06-12 Ellington             252780     378500
10      2014 2015-04-28 Hartford               12027      25090
# … with 65 more rows, and 3 more variables: residential_type <chr>,
#   location <chr>, year <date>
Get summary statistics of assessed_price and sale_price
summarize(connecticut, 
          mean_assessed_price = mean(assessed_price), 
          mean_sale_price = mean(sale_price),
          median_assessed_price = median(assessed_price),
          median_sale_price = median(sale_price),
          sd_assessed_price =sd(assessed_price),
          sd_sale_price = sd(sale_price))
# A tibble: 1 × 6
  mean_assessed_pri… mean_sale_price median_assessed… median_sale_pri…
               <dbl>           <dbl>            <dbl>            <dbl>
1            243300.         339122.           159900           229000
# … with 2 more variables: sd_assessed_price <dbl>,
#   sd_sale_price <dbl>
View favstats of assessed_price
favstats(~assessed_price, data = connecticut)
 min     Q1 median     Q3       max     mean      sd      n missing
   0 111440 159900 245980 881510000 243299.8 1457126 413706       0
View favstats of sale_price
options(scipen = 999) # remove scientific notation
favstats(~sale_price, data = connecticut)
 min     Q1 median     Q3       max     mean     sd      n missing
   0 148000 229000 358500 120000000 339121.6 608770 413706       0
Examine counts of residential_type
table(select(connecticut, residential_type))

        Condo   Four Family Single Family  Three Family    Two Family 
        76907          1690        303731         10194         21184 
connecticut %>%
  count(residential_type)
# A tibble: 5 × 2
  residential_type      n
  <chr>             <int>
1 Condo             76907
2 Four Family        1690
3 Single Family    303731
4 Three Family      10194
5 Two Family        21184
View proportions of residential_type
prop.table(table(select(connecticut, residential_type)))

        Condo   Four Family Single Family  Three Family    Two Family 
  0.185897715   0.004085027   0.734171126   0.024640687   0.051205445 
View counts of town variable
connecticut %>%
  count(town) %>%
  arrange(desc(n))
# A tibble: 169 × 2
   town              n
   <chr>         <int>
 1 Bridgeport    15673
 2 Waterbury     12509
 3 Stamford      11776
 4 Norwalk        9701
 5 Fairfield      8878
 6 Hamden         8746
 7 New Haven      8362
 8 West Hartford  8346
 9 Danbury        8120
10 Milford        7926
# … with 159 more rows
View proportions of town variable
options(scipen = 999) #remove scientific notation
prop.table(table(select(connecticut, town)))

   ***Unknown***          Andover          Ansonia          Ashford 
  0.000002417175   0.000807336611   0.004481443344   0.001053888510 
            Avon      Barkhamsted     Beacon Falls           Berlin 
  0.006492533345   0.000787999207   0.001899899929   0.007208017288 
         Bethany           Bethel        Bethlehem       Bloomfield 
  0.001629176275   0.007169342480   0.000872600349   0.006485281818 
          Bolton           Bozrah         Branford       Bridgeport 
  0.001522820554   0.000570453414   0.009086162637   0.037884391331 
     Bridgewater          Bristol       Brookfield         Brooklyn 
  0.000601876695   0.018525232895   0.006098533741   0.003014217826 
      Burlington           Canaan       Canterbury           Canton 
  0.003400965903   0.000352907620   0.001317360638   0.003676523908 
         Chaplin         Cheshire          Chester          Clinton 
  0.000563201887   0.008846862265   0.001123986599   0.005448313537 
      Colchester        Colebrook         Columbia         Cornwall 
  0.004508032274   0.000464097693   0.001588084292   0.000352907620 
        Coventry         Cromwell          Danbury           Darien 
  0.004237308620   0.005242853621   0.019627464915   0.008092703514 
      Deep River            Derby           Durham      East Granby 
  0.001588084292   0.003149579653   0.002296316708   0.002359163271 
     East Haddam     East Hampton    East Hartford       East Haven 
  0.001723446119   0.005564337960   0.017481013087   0.009939425582 
       East Lyme     East Windsor         Eastford           Easton 
  0.003604008644   0.002651641504   0.000345656094   0.002982794545 
       Ellington          Enfield            Essex        Fairfield 
  0.005312951710   0.012397693048   0.002032844580   0.021459683930 
      Farmington         Franklin      Glastonbury           Goshen 
  0.007210434463   0.000423005709   0.013415323926   0.001467225518 
          Granby        Greenwich         Griswold           Groton 
  0.003558082310   0.011732969790   0.003526659028   0.007019477600 
        Guilford           Haddam           Hamden          Hampton 
  0.007785722228   0.001051471335   0.021140616766   0.000493103798 
        Hartford         Hartland        Harwinton           Hebron 
  0.015513432244   0.000464097693   0.001580832765   0.001600170169 
            Kent        Killingly     Killingworth          Lebanon 
  0.001237593847   0.006581968838   0.002223801444   0.002337408691 
         Ledyard           Lisbon       Litchfield             Lyme 
  0.003113322021   0.001177164460   0.001950660614   0.000894354928 
         Madison       Manchester        Mansfield      Marlborough 
  0.006934876458   0.016025873446   0.004172044882   0.002117445722 
         Meriden       Middlebury      Middlefield       Middletown 
  0.016151566571   0.002542868607   0.000848428594   0.012533054875 
         Milford           Monroe        Montville           Morris 
  0.019158532871   0.006816434860   0.000246551899   0.000727569820 
       Naugatuck      New Britain       New Canaan    New Fairfield 
  0.009791977878   0.018435797402   0.008080617637   0.004203468163 
    New Hartford        New Haven       New London      New Milford 
  0.002368831972   0.020212421381   0.006569882960   0.008849279440 
       Newington          Newtown          Norfolk   North Branford 
  0.012201901834   0.004802927683   0.000570453414   0.002496942273 
    North Canaan      North Haven North Stonington          Norwalk 
  0.000795250734   0.007742213069   0.001421299183   0.023449019352 
         Norwich         Old Lyme     Old Saybrook           Orange 
  0.011479166365   0.003758707875   0.005008387599   0.003756290699 
          Oxford       Plainfield       Plainville         Plymouth 
  0.004039100231   0.002588794941   0.005264608200   0.000908857981 
        Portland          Preston         Prospect           Putnam 
  0.003241432322   0.001295606058   0.003019052177   0.003142328127 
         Redding       Ridgefield       Rocky Hill          Roxbury 
  0.002434095710   0.008293329079   0.005929331458   0.000783164856 
           Salem        Salisbury         Scotland          Seymour 
  0.001256931251   0.000727569820   0.000036257632   0.005387884150 
          Sharon          Shelton          Sherman         Simsbury 
  0.001000710650   0.012871459442   0.001358452621   0.006502202047 
          Somers    South Windsor        Southbury      Southington 
  0.002975543018   0.008409353502   0.008757426772   0.011111755691 
         Sprague         Stafford         Stamford         Sterling 
  0.000785582032   0.003623346048   0.028464658477   0.001041802633 
      Stonington        Stratford         Suffield        Thomaston 
  0.002774917453   0.017524522245   0.004435517010   0.002192378162 
        Thompson          Tolland       Torrington         Trumbull 
  0.001464808342   0.004128535723   0.013623201017   0.011111755691 
           Union           Vernon        Voluntown      Wallingford 
  0.000183705337   0.008672825630   0.000870183174   0.015779321547 
          Warren       Washington        Waterbury        Waterford 
  0.000505189676   0.001515569027   0.030236448106   0.005919662756 
       Watertown    West Hartford       West Haven        Westbrook 
  0.005320203236   0.020173746574   0.012922220127   0.002521114028 
          Weston         Westport     Wethersfield       Willington 
  0.003342953692   0.010894209898   0.008112040918   0.001097397669 
          Wilton       Winchester          Windham          Windsor 
  0.005233184919   0.003432389185   0.002146451828   0.008887954248 
   Windsor Locks          Wolcott       Woodbridge         Woodbury 
  0.004524952503   0.005001136072   0.002936868211   0.003052892634 
       Woodstock 
  0.002516279677 
View summary statistics of different residential_types
connecticut %>%
  group_by(residential_type) %>%
  summarize(mean_sale_price_type = mean(sale_price),
            median_sale_price_type = median(sale_price),
            sd_sale_price_type = sd(sale_price))
# A tibble: 5 × 4
  residential_type mean_sale_price_… median_sale_pri… sd_sale_price_t…
  <chr>                        <dbl>            <dbl>            <dbl>
1 Condo                      255460.           175000          612254.
2 Four Family                244174.           190000          382695.
3 Single Family              376150.           250000          631711.
4 Three Family               180494.           155000          165494.
5 Two Family                 195847.           163500          227566.
View distribution of sale price, check for outliers
connecticut %>%
  group_by(residential_type) %>%
  ggplot(aes(x = residential_type, 
             y = sale_price)) +
  geom_violin(notch = TRUE) +
  labs(title = "Distribution of Sale Price for Residential Types",
       x = "Property Type",
       y = "Sale Price") +
  theme_light()

Remove top and bottom 2.5% of sale_price
connecticut <- connecticut %>%
  filter(sale_price < quantile(connecticut$sale_price, .975) & 
           sale_price > quantile(connecticut$sale_price, .025)) 
View distribution after trimming the top and bottom 2.5%
connecticut %>%
  group_by(residential_type) %>%
  ggplot(aes(x = residential_type, 
             y = sale_price)) +
  geom_violin(aes(fill = residential_type), notch = TRUE) +
  labs(title = "Distribution of Sale Price for Residential Types",
       x = "Residential Type",
       y = "Sale Price") +
  theme_light()

Create histogram to get another view of distribution of sale_price
connecticut %>%
  ggplot(aes(x=sale_price, fill = residential_type)) +
    geom_histogram(binwidth = 100000) +
    labs(title = "Distribution of Sale Price",
         ylab = "Count",
         xlab = "Sale Price")

Counts of sale_price facet wrapped by residential_type
connecticut %>%
  ggplot(aes(x=sale_price, fill = residential_type)) +
    geom_histogram(binwidth = 200000) +
    labs(title = "Distribution of Sale Price: Residential Type",
         ylab = "Count",
         xlab = "Sale Price") +
    theme_bw() +
    facet_wrap(vars(residential_type), 
               scales = "free_y", 
                        "free_x")

View favstats of sale_price after being trimmed
favstats(~sale_price, data = connecticut)
   min     Q1 median     Q3     max     mean       sd      n missing
 35001 150000 229000 350000 1374000 283812.3 208105.4 392896       0
Create a bargraph of mean sale price of each town in Connecticut trimmed another 10%
connecticut %>%
  group_by(town, residential_type) %>%
  summarize(trimmed_mean_10 = mean(sale_price, trim = .1)) %>%
  ggplot(aes(x = town, y = trimmed_mean_10, fill = residential_type)) +
  geom_bar(stat = "identity") +
  labs(title = "Trimmed Mean (10%) of Sale Price of Towns in Connecticut", 
       y = "Price", 
       x = "Town") +
  theme_classic()

Group by year and display mean sale price
connecticut %>%
  group_by(year) %>%
  summarize(mean_year = mean(sale_price)) %>%
  arrange()
# A tibble: 18 × 2
   year       mean_year
   <date>         <dbl>
 1 2001-01-01   480542.
 2 2004-01-01   458225 
 3 2005-01-01   180333.
 4 2006-01-01   313608.
 5 2007-01-01   328287.
 6 2008-01-01   295805.
 7 2009-01-01   268327.
 8 2010-01-01   282171.
 9 2011-01-01   275290.
10 2012-01-01   287261.
11 2013-01-01   280918.
12 2014-01-01   270095.
13 2015-01-01   276110.
14 2016-01-01   270123.
15 2017-01-01   283867.
16 2018-01-01   274166.
17 2019-01-01   272908.
18 2020-01-01   307886.
Group by year and arrange descending by year
connecticut %>%
  group_by(sale_date, year) %>%
  arrange(sale_date) 
# A tibble: 392,896 × 8
# Groups:   sale_date, year [4,137]
   list_year sale_date  town          assessed_price sale_price
       <dbl> <date>     <chr>                  <dbl>      <dbl>
 1      2017 2001-08-01 Shelton               390810     635000
 2      2017 2001-08-23 Shelton               427910     558000
 3      2006 2001-09-05 Newington             166110     256000
 4      2006 2001-09-25 Naugatuck             183850     293711
 5      2006 2001-10-27 Greenwich             573930     660000
 6      2006 2004-07-27 South Windsor         113580     210000
 7      2006 2004-08-02 Stratford              92750     179000
 8      2006 2004-09-24 South Windsor         123520     243900
 9      2007 2004-11-02 Westport              869600    1200000
10      2007 2005-02-05 Meriden               155340     264000
# … with 392,886 more rows, and 3 more variables:
#   residential_type <chr>, location <chr>, year <date>
Filter out all sale dates prior to 2006
connecticut <-connecticut %>%
  filter(sale_date >= as.Date("2005-12-31"))

Creating Line Graph

Create an object of the towns with bottom five mean sale prices
mean_bottom_5 <-connecticut %>%
  group_by(town) %>%
  summarize(mean_bottom_5 = mean(sale_price)) %>%
  arrange(mean_bottom_5) %>%
  slice(1:5)
Create an object of the towns with the top five mean sale prices
mean_top_5 <-connecticut %>%
  group_by(town) %>%
  summarize(mean_top_5 = mean(sale_price)) %>%
  arrange(desc(mean_top_5)) %>%
  slice(1:5)
View top and bottom 10 towns
mean_bottom_5
# A tibble: 5 × 2
  town          mean_bottom_5
  <chr>                 <dbl>
1 Waterbury           114718.
2 Torrington          140221.
3 New Britain         140224.
4 Norwich             143596.
5 East Hartford       144837.
mean_top_5
# A tibble: 5 × 2
  town       mean_top_5
  <chr>           <dbl>
1 Darien        867241.
2 New Canaan    856889.
3 Westport      820937.
4 Greenwich     762582.
5 Weston        758676.
Create Vector of top 5 towns
top_5 <- mean_top_5$town
Create a line graph of the top ten towns with the highest median sale prices
connecticut %>%
  group_by(town, year) %>%
  summarize(mean_year = mean(sale_price)) %>%
  filter(town %in% top_5) %>%
  ggplot(aes(x = year, y = mean_year)) +
  geom_line(aes(color = town)) +
  ggtitle("Towns in Connecticut with the Highest Mean Sale Price: Top 5") +
  ylab("Sale Price") +
  xlab("Year") +
  theme_classic()

Same as above, facet wrapped with geom_smooth
connecticut %>%
  group_by(town, year) %>%
  summarize(mean_year = mean(sale_price)) %>%
  filter(town %in% top_5) %>%
  ggplot(aes(x = year, y = mean_year)) +
  geom_line(aes(color = town)) +
  geom_smooth() +
  ggtitle("Towns With Highest Mean Sale Price: Top 5") +
  ylab("Sale Price") +
  xlab("Year") +
  theme_classic() +
  facet_wrap(vars(town))

Scatterplot

Create scatterplot of mean_monthly_sale_price grouped by residential_type
connecticut %>%
  group_by(residential_type, month = lubridate::floor_date(sale_date, "month")) %>%
  summarize(mean_monthy_sale_price = mean(sale_price)) %>%
  ggplot(aes(x = month, y = mean_monthy_sale_price)) +
  geom_point((aes(color = residential_type))) +
  geom_smooth() +
  ggtitle("Mean Monthly Sale Price of Residential Types") +
  ylab("Mean Sale Price") +
  xlab("Time") +
  theme_linedraw()

Scatterplot of yearly mean sale price for each residential_type with confidence intervals and regression line
connecticut %>%
  group_by(residential_type, year) %>%
  summarize(mean_year_type = mean(sale_price)) %>%
  ggplot(aes(x = year, y = mean_year_type)) +
  geom_point((aes(color = residential_type))) +
  geom_smooth() +
  ggtitle("Yearly Mean Sale Price: Residential Types") +
  ylab("Mean Sale Price") +
  xlab("Time") +
  theme_linedraw() +
  facet_wrap(vars(residential_type))

Mapping

Read in map data and view Connecticut map
con_map <- map_data("county", 
         "connecticut")
ggplot() +
  geom_polygon(data = con_map,
               aes(x = long, y = lat, group = group),
               color = "black", fill = "slateblue1")

Read in data frame containing Connecticut town and county data
county_town <- read_csv(("/Users/nelsonfarrell/Documents/501 Stats/Connecticut County:Towns.csv"))
county_town <- county_town %>%
  select("subregion", "town")
Create object mapping: Joining connecticut and county_town data
mapping <- 
  left_join(connecticut, county_town, by = "town") %>%
  group_by(subregion) %>%
  summarize(county_mean = mean(sale_price))
mapping
# A tibble: 9 × 2
  subregion  county_mean
  <chr>            <dbl>
1 fairfield      419357.
2 hartford       235023.
3 litchfield     253131.
4 middlesex      276324.
5 new haven      237101.
6 new london     233944.
7 tolland        233742.
8 windham        189834.
9 <NA>           282450 
Join mapping object with con_map data
mapping <- inner_join(mapping, con_map, by = "subregion")
Create object to use when labeling the counties on the map
county_names <- aggregate(cbind(long, lat) ~ subregion, data=mapping, 
                    FUN=function(x)mean(range(x)))
Create map of Connecticut counties displaying the mean sale price of residential properties
ggplot(mapping, aes(x = long, y = lat)) +
  geom_polygon(aes(fill = county_mean, group = subregion)) +
  geom_text(data = county_names, aes(long, lat, label = subregion), size =3) +
  scale_fill_gradient(low = "bisque4", high = "bisque2") +
  theme_light()

Change in mean sale_price

Create of object of mean sale price for each town in 2006
mean_2006 <- connecticut %>%
  group_by(town, year) %>%
  select(year, 
         sale_price, 
         town) %>%
  summarize(mean_3 = mean(sale_price)) %>%
  filter(year == "2006-01-01")
Create of object of mean sale price for each town in 2018
mean_2018 <- connecticut %>%
  group_by(town, year) %>%
  select(year, 
         sale_price, 
         town) %>%
  summarize(mean_3 = mean(sale_price)) %>%
  filter(year == "2018-01-01")
View objects mean_2006 and mean_2018
mean_2006
# A tibble: 130 × 3
# Groups:   town [130]
   town        year        mean_3
   <chr>       <date>       <dbl>
 1 Ansonia     2006-01-01 240118.
 2 Ashford     2006-01-01 236024.
 3 Avon        2006-01-01 412245.
 4 Barkhamsted 2006-01-01 268300 
 5 Berlin      2006-01-01 271868.
 6 Bethany     2006-01-01 377326.
 7 Bethel      2006-01-01 340494.
 8 Bethlehem   2006-01-01 289000.
 9 Bloomfield  2006-01-01 305865.
10 Bolton      2006-01-01 286017.
# … with 120 more rows
mean_2018
# A tibble: 168 × 3
# Groups:   town [168]
   town         year        mean_3
   <chr>        <date>       <dbl>
 1 Andover      2018-01-01 236506.
 2 Ansonia      2018-01-01 197108.
 3 Ashford      2018-01-01 187244.
 4 Avon         2018-01-01 389546.
 5 Barkhamsted  2018-01-01 301730.
 6 Beacon Falls 2018-01-01 238719.
 7 Berlin       2018-01-01 254156.
 8 Bethany      2018-01-01 373936.
 9 Bethel       2018-01-01 325866.
10 Bethlehem    2018-01-01 311912.
# … with 158 more rows
Join mean_2006 and mean_2018 and create new column delta_mean and c_mean
combined <- mean_2006 %>%
  left_join(mean_2018, by = "town") %>%
  mutate(delta_mean = mean_3.y - mean_3.x) %>%
  mutate(c_mean = (mean_3.y - mean_3.x)/mean_3.x)
combined
# A tibble: 130 × 7
# Groups:   town [130]
   town    year.x     mean_3.x year.y     mean_3.y delta_mean   c_mean
   <chr>   <date>        <dbl> <date>        <dbl>      <dbl>    <dbl>
 1 Ansonia 2006-01-01  240118. 2018-01-01  197108.    -43010. -0.179  
 2 Ashford 2006-01-01  236024. 2018-01-01  187244.    -48780. -0.207  
 3 Avon    2006-01-01  412245. 2018-01-01  389546.    -22699. -0.0551 
 4 Barkha… 2006-01-01  268300  2018-01-01  301730.     33430.  0.125  
 5 Berlin  2006-01-01  271868. 2018-01-01  254156.    -17712. -0.0651 
 6 Bethany 2006-01-01  377326. 2018-01-01  373936.     -3390. -0.00898
 7 Bethel  2006-01-01  340494. 2018-01-01  325866.    -14628. -0.0430 
 8 Bethle… 2006-01-01  289000. 2018-01-01  311912.     22913.  0.0793 
 9 Bloomf… 2006-01-01  305865. 2018-01-01  213458.    -92406. -0.302  
10 Bolton  2006-01-01  286017. 2018-01-01  271425.    -14592. -0.0510 
# … with 120 more rows
Rename columns in combined
combined <- rename(combined, 
                   "mean_06" = "mean_3.x", 
                   "mean_18" = "mean_3.y") 
Remove unnecessary columns
combined <-
  select(combined,
         "mean_06",
         "mean_18",
         "delta_mean",
         "c_mean",
         "town")
Create object of the six towns with highest change in mean sale price (%)
delta_top_6 <- combined %>%
  arrange(desc(c_mean)) %>%
  filter(c_mean > .14)
delta_top_6
# A tibble: 6 × 5
# Groups:   town [6]
  mean_06 mean_18 delta_mean c_mean town     
    <dbl>   <dbl>      <dbl>  <dbl> <chr>    
1 216725  285550      68825   0.318 Franklin 
2 386750  481049.     94299.  0.244 Sharon   
3 436000  528995.     92995.  0.213 Salisbury
4 273850  324583.     50733.  0.185 Colebrook
5 248727. 289751.     41024.  0.165 Chester  
6 298083. 341824.     43740.  0.147 Morris   
Create vector of top 6
delta_top_6b <- delta_top_6$town
Display top 6
connecticut %>%
  group_by(town, year, residential_type) %>%
  summarise(mean5 = mean(sale_price)) %>%
  filter(town %in% delta_top_6b) %>%
  ggplot(aes(x = year, 
             y = mean5)) +
  geom_line(aes(color = town)) +
  geom_smooth() +
  facet_wrap(vars(town))

Status Summary

Reuse

Text and figures are licensed under Creative Commons Attribution CC BY-NC 4.0. The figures that have been reused from other sources don't fall under this license and can be recognized by a note in their caption: "Figure from ...".

Citation

For attribution, please cite this work as

Farrell (2022, March 12). Data Analytics and Computational Social Science: Homework 5: Complete. Retrieved from https://github.com/DACSS/dacss_course_website/posts/httpsrpubscomjnfarrell211875890/

BibTeX citation

@misc{farrell2022homework,
  author = {Farrell, Joseph},
  title = {Data Analytics and Computational Social Science: Homework 5: Complete},
  url = {https://github.com/DACSS/dacss_course_website/posts/httpsrpubscomjnfarrell211875890/},
  year = {2022}
}