Homework 5: Incomplete

Connecticut Estate Sales Data

Joseph Farrell
2022-03-02
Load Libraries
Read in data
connecticut <- read_csv("/Users/nelsonfarrell/Downloads/Real_Estate_Sales_2001-2019_GL.csv")
Check variables of connecticut
str(connecticut)
spec_tbl_df [930,621 × 14] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
 $ Serial Number   : num [1:930621] 141466 140604 14340 140455 141195 ...
 $ List Year       : num [1:930621] 2014 2014 2014 2014 2014 ...
 $ Date Recorded   : chr [1:930621] "08/06/2015" "06/29/2015" "07/01/2015" "04/30/2015" ...
 $ Town            : chr [1:930621] "Stamford" "New Haven" "Ridgefield" "New Britain" ...
 $ Address         : chr [1:930621] "83 OVERBROOK DRIVE" "56 HIGHVIEW LANE" "32 OVERLOOK DR" "171 BRADFORD WALK" ...
 $ Assessed Value  : num [1:930621] 503270 86030 351880 204680 229330 ...
 $ Sale Amount     : num [1:930621] 850000 149900 570000 261000 250000 ...
 $ Sales Ratio     : num [1:930621] 0.592 0.574 0.617 0.784 0.917 ...
 $ Property Type   : chr [1:930621] "Residential" "Residential" "Residential" "Residential" ...
 $ Residential Type: chr [1:930621] "Single Family" "Single Family" "Single Family" "Condo" ...
 $ Non Use Code    : chr [1:930621] NA NA NA NA ...
 $ Assessor Remarks: chr [1:930621] NA NA NA NA ...
 $ OPM remarks     : chr [1:930621] NA NA NA NA ...
 $ Location        : chr [1:930621] "POINT (-73.525969 41.081897)" "POINT (-72.878115 41.30285)" "POINT (-73.508273 41.286223)" "POINT (-72.775592 41.713335)" ...
 - attr(*, "spec")=
  .. cols(
  ..   `Serial Number` = col_double(),
  ..   `List Year` = col_double(),
  ..   `Date Recorded` = col_character(),
  ..   Town = col_character(),
  ..   Address = col_character(),
  ..   `Assessed Value` = col_double(),
  ..   `Sale Amount` = col_double(),
  ..   `Sales Ratio` = col_double(),
  ..   `Property Type` = col_character(),
  ..   `Residential Type` = col_character(),
  ..   `Non Use Code` = col_character(),
  ..   `Assessor Remarks` = col_character(),
  ..   `OPM remarks` = col_character(),
  ..   Location = col_character()
  .. )
 - attr(*, "problems")=<externalptr> 
Remove columns Serial number, Non Use Code, Assessor Remarks, Sales Ratio, and OPM Remarks
connecticut <- select(connecticut, 
                      "List Year", 
                      "Date Recorded", 
                      "Town", 
                      "Assessed Value", 
                      "Sale Amount", 
                      "Property Type", 
                      "Residential Type", 
                      "Location")
head(connecticut) #view first 6 rows of `connecticut`
# A tibble: 6 × 8
  `List Year` `Date Recorded` Town      `Assessed Value` `Sale Amount`
        <dbl> <chr>           <chr>                <dbl>         <dbl>
1        2014 08/06/2015      Stamford            503270        850000
2        2014 06/29/2015      New Haven            86030        149900
3        2014 07/01/2015      Ridgefie…           351880        570000
4        2014 04/30/2015      New Brit…           204680        261000
5        2014 06/26/2015      Stamford            229330        250000
6        2014 06/29/2015      West Har…           356300        600000
# … with 3 more variables: `Property Type` <chr>,
#   `Residential Type` <chr>, Location <chr>
Rename columns without spaces and for clarity
connecticut <- rename(connecticut, 
                      "list_year" = "List Year", 
                      "sale_date" = "Date Recorded", 
                      "assessed_price" = "Assessed Value", 
                      "sale_price" = "Sale Amount", 
                      "property_type" = "Property Type", 
                      "residential_type" = "Residential Type", 
                      "location" = "Location", 
                      "town" = "Town")

List and description of remaining variables

Potential Research Questions

Explore Data
head(connecticut)
# A tibble: 6 × 8
  list_year sale_date  town    assessed_price sale_price property_type
      <dbl> <chr>      <chr>            <dbl>      <dbl> <chr>        
1      2014 08/06/2015 Stamfo…         503270     850000 Residential  
2      2014 06/29/2015 New Ha…          86030     149900 Residential  
3      2014 07/01/2015 Ridgef…         351880     570000 Residential  
4      2014 04/30/2015 New Br…         204680     261000 Residential  
5      2014 06/26/2015 Stamfo…         229330     250000 Residential  
6      2014 06/29/2015 West H…         356300     600000 Residential  
# … with 2 more variables: residential_type <chr>, location <chr>
tail(connecticut)
# A tibble: 6 × 8
  list_year sale_date  town    assessed_price sale_price property_type
      <dbl> <chr>      <chr>            <dbl>      <dbl> <chr>        
1      2014 02/19/2015 Hamden          220080     402500 Residential  
2      2014 12/18/2014 Danbury         248500     428000 Residential  
3      2014 11/20/2014 Naugat…         175910     252500 Residential  
4      2014 01/21/2015 Bridge…         723600     466667 Commercial   
5      2014 06/18/2015 Madison         409200     580000 Residential  
6      2014 04/22/2015 Cheshi…         379100     520000 Residential  
# … with 2 more variables: residential_type <chr>, location <chr>
Convert sale_date from character to date
connecticut$sale_date <- as.Date(connecticut$sale_date, 
                                 format = "%m/%d/%Y" )
str(connecticut)
tibble [930,621 × 8] (S3: tbl_df/tbl/data.frame)
 $ list_year       : num [1:930621] 2014 2014 2014 2014 2014 ...
 $ sale_date       : Date[1:930621], format: "2015-08-06" ...
 $ town            : chr [1:930621] "Stamford" "New Haven" "Ridgefield" "New Britain" ...
 $ assessed_price  : num [1:930621] 503270 86030 351880 204680 229330 ...
 $ sale_price      : num [1:930621] 850000 149900 570000 261000 250000 ...
 $ property_type   : chr [1:930621] "Residential" "Residential" "Residential" "Residential" ...
 $ residential_type: chr [1:930621] "Single Family" "Single Family" "Single Family" "Condo" ...
 $ location        : chr [1:930621] "POINT (-73.525969 41.081897)" "POINT (-72.878115 41.30285)" "POINT (-73.508273 41.286223)" "POINT (-72.775592 41.713335)" ...
Explore property_type variable
tally(~property_type, data = connecticut)
property_type
             A     Apartments              C          C/I/U 
          2210           2922              3          11913 
    Commercial          Condo    Four Family     Industrial 
         19404           9932            237           2551 
Public Utility              R    Residential  Single Family 
            66         290287         494147          40518 
  Three Family     Two Family             UA              V 
          1228           2411             11          24479 
   Vacant Land           <NA> 
         23414           4888 
Filter property_type for Residential, Single Family, Two Family, Three Family, Condo
connecticut <- connecticut %>%
  filter(property_type %in% 
           c("Residential", 
             "Single Family", 
             "Two Family", 
             "Three Family", 
             "Condo"))
Examine first and last 75 rows of connecticut
head(connecticut, n = 75)
# A tibble: 75 × 8
   list_year sale_date  town   assessed_price sale_price property_type
       <dbl> <date>     <chr>           <dbl>      <dbl> <chr>        
 1      2014 2015-08-06 Stamf…         503270     850000 Residential  
 2      2014 2015-06-29 New H…          86030     149900 Residential  
 3      2014 2015-07-01 Ridge…         351880     570000 Residential  
 4      2014 2015-04-30 New B…         204680     261000 Residential  
 5      2014 2015-06-26 Stamf…         229330     250000 Residential  
 6      2014 2015-06-29 West …         356300     600000 Residential  
 7      2014 2014-10-17 Willi…         236060     332900 Residential  
 8      2014 2015-08-05 Trumb…         183900     240000 Residential  
 9      2014 2015-09-01 Water…          89610      75500 Residential  
10      2014 2014-12-02 Water…          75840      91000 Residential  
# … with 65 more rows, and 2 more variables: residential_type <chr>,
#   location <chr>
tail(connecticut, n = 75)
# A tibble: 75 × 8
   list_year sale_date  town   assessed_price sale_price property_type
       <dbl> <date>     <chr>           <dbl>      <dbl> <chr>        
 1      2014 2014-10-09 Groton         155610      83000 Residential  
 2      2014 2015-04-01 Manch…         112300     164800 Residential  
 3      2014 2015-07-30 Farmi…         449170     710000 Residential  
 4      2014 2015-04-28 Harwi…         146430     185000 Residential  
 5      2014 2015-05-19 Chesh…         266540     342000 Residential  
 6      2014 2015-09-21 Darien         549990    3395000 Residential  
 7      2014 2015-06-26 Hamden         148470     175000 Residential  
 8      2014 2015-07-21 Killi…          73850     124900 Residential  
 9      2014 2014-12-11 Brook…         240180     395000 Residential  
10      2014 2015-07-16 Merid…         114170     108500 Residential  
# … with 65 more rows, and 2 more variables: residential_type <chr>,
#   location <chr>
Get summary statistics of assessed_price and sale_price
summarize(connecticut, 
          mean_assessed_price = mean(assessed_price), 
          mean_sale_price = mean(sale_price),
          median_assessed_price = median(assessed_price),
          median_sale_price = median(sale_price),
          sd_assessed_price =sd(assessed_price),
          sd_sale_price = sd(sale_price))
# A tibble: 1 × 6
  mean_assessed_pri… mean_sale_price median_assessed… median_sale_pri…
               <dbl>           <dbl>            <dbl>            <dbl>
1            247960.         349358.           159755           230000
# … with 2 more variables: sd_assessed_price <dbl>,
#   sd_sale_price <dbl>
View favstats of assessed_price
favstats(~assessed_price, data = connecticut)
 min     Q1 median     Q3       max     mean      sd      n missing
   0 111000 159755 247270 881510000 247959.7 1292161 548236       0
View favstats of sale_price
options(scipen = 999) # remove scientific notation
favstats(~sale_price, data = connecticut)
 min     Q1 median     Q3       max     mean       sd      n missing
   0 147500 230000 360000 120000000 349357.8 636067.5 548236       0
Examine counts of property_type
table(select(connecticut, property_type))

        Condo   Residential Single Family  Three Family    Two Family 
         9932        494147         40518          1228          2411 
connecticut %>%
  count(property_type)
# A tibble: 5 × 2
  property_type      n
  <chr>          <int>
1 Condo           9932
2 Residential   494147
3 Single Family  40518
4 Three Family    1228
5 Two Family      2411
View proportions of property type
prop.table(table(select(connecticut, property_type)))

        Condo   Residential Single Family  Three Family    Two Family 
  0.018116286   0.901339934   0.073906128   0.002239911   0.004397741 
View counts of town variable
connecticut %>%
  count(town) %>%
  arrange(desc(n))
# A tibble: 170 × 2
   town              n
   <chr>         <int>
 1 Bridgeport    18239
 2 Waterbury     15533
 3 Stamford      14678
 4 Norwalk       12698
 5 Fairfield     11340
 6 West Hartford 10750
 7 Danbury       10588
 8 Greenwich     10559
 9 Hamden        10318
10 New Haven     10133
# … with 160 more rows
View proportions of town variable
options(scipen = 999) #remove scientific notation
prop.table(table(select(connecticut, town)))

   ***Unknown***          Andover          Ansonia          Ashford 
  0.000001824032   0.000835406650   0.004607504797   0.001194740951 
            Avon      Barkhamsted     Beacon Falls           Berlin 
  0.007234110857   0.000906543897   0.001935297937   0.005915335731 
         Bethany           Bethel        Bethlehem       Bloomfield 
  0.001324247222   0.005667267381   0.000793453914   0.005625314646 
          Bolton           Bozrah         Branford       Bridgeport 
  0.001406328661   0.000512552988   0.009603528407   0.033268519397 
     Bridgewater          Bristol       Brookfield         Brooklyn 
  0.000541737500   0.016241180805   0.005570593686   0.002657614604 
      Burlington           Canaan       Canterbury           Canton 
  0.003239480808   0.000383046717   0.001364375926   0.003553214309 
         Chaplin         Cheshire          Chester          Clinton 
  0.000516201052   0.008085933795   0.001008689688   0.005059864730 
      Colchester        Colebrook         Columbia         Cornwall 
  0.004158792929   0.000443239773   0.001455577525   0.000465128156 
        Coventry         Cromwell          Danbury           Darien 
  0.003947205218   0.004755251388   0.019312850670   0.007179389898 
      Deep River            Derby           Durham      East Granby 
  0.001459225589   0.003368987079   0.002115877104   0.002083044528 
     East Haddam     East Hampton    East Hartford       East Haven 
  0.003026069065   0.004981431354   0.014707169905   0.008452564224 
       East Lyme     East Windsor         Eastford           Easton 
  0.006807287373   0.003575102693   0.000390342845   0.002571885101 
       Ellington          Enfield            Essex        Fairfield 
  0.004799028156   0.011383783626   0.002588301388   0.020684522724 
      Farmington         Franklin      Glastonbury           Goshen 
  0.008974237372   0.000397638973   0.012649661824   0.001293238678 
          Granby        Greenwich         Griswold           Groton 
  0.003463836742   0.019259953743   0.003319738215   0.009545159384 
        Guilford           Haddam           Hamden          Hampton 
  0.007166621674   0.002570061069   0.018820362034   0.000454183965 
        Hartford         Hartland        Harwinton           Hebron 
  0.013371978491   0.000490664604   0.001384440278   0.002467915277 
            Kent        Killingly     Killingworth          Lebanon 
  0.001050642424   0.006103211026   0.001920705681   0.002103108880 
         Ledyard           Lisbon       Litchfield             Lyme 
  0.005395486615   0.001067058712   0.002497099789   0.000802574074 
         Madison       Manchester        Mansfield      Marlborough 
  0.006544626767   0.014645152817   0.004071239393   0.001906113426 
         Meriden       Middlebury      Middlefield       Middletown 
  0.017324655805   0.002394953998   0.001081650968   0.012003954501 
         Milford           Monroe        Montville           Morris 
  0.016611459299   0.006571987246   0.005152890361   0.000735084890 
       Naugatuck      New Britain       New Canaan    New Fairfield 
  0.008742585310   0.016055129543   0.007237758921   0.003974565698 
    New Hartford        New Haven       New London      New Milford 
  0.002104932912   0.018482916116   0.006210828913   0.009238722010 
       Newington          Newtown          Norfolk   North Branford 
  0.010900415150   0.007660934342   0.000567273948   0.003533149957 
    North Canaan      North Haven North Stonington          Norwalk 
  0.000777037626   0.007130141034   0.001482938005   0.023161558161 
         Norwich         Old Lyme     Old Saybrook           Orange 
  0.011091938508   0.003206648232   0.004394093055   0.003241304840 
          Oxford       Plainfield       Plainville         Plymouth 
  0.003976389730   0.004122312289   0.005346237752   0.003565982533 
         Pomfret         Portland          Preston         Prospect 
  0.001094419192   0.003296025799   0.001313303030   0.003166519528 
          Putnam          Redding       Ridgefield       Rocky Hill 
  0.002965876010   0.002455147053   0.008383251009   0.005731108501 
         Roxbury            Salem        Salisbury         Scotland 
  0.000811694234   0.001302358838   0.001568667508   0.000404935101 
         Seymour           Sharon          Shelton          Sherman 
  0.005255036152   0.001140019991   0.012691614560   0.001265878198 
        Simsbury           Somers    South Windsor        Southbury 
  0.007774024325   0.002783472811   0.008558358079   0.008458036320 
     Southington          Sprague         Stafford         Stamford 
  0.014865860688   0.000735084890   0.003462012710   0.026773141494 
        Sterling       Stonington        Stratford         Suffield 
  0.001130899831   0.006117803282   0.016883240064   0.004490766750 
       Thomaston         Thompson          Tolland       Torrington 
  0.002145061616   0.002754288299   0.003834115235   0.013375626555 
        Trumbull            Union           Vernon        Voluntown 
  0.010712539855   0.000184227231   0.008335826177   0.000795277946 
     Wallingford           Warren       Washington        Waterbury 
  0.015137641454   0.000474248316   0.001530362836   0.028332688842 
       Waterford        Watertown    West Hartford       West Haven 
  0.006809111405   0.005497632406   0.019608343852   0.013371978491 
       Westbrook           Weston         Westport     Wethersfield 
  0.002624782028   0.003347098695   0.010915007406   0.008211792002 
      Willington           Wilton       Winchester          Windham 
  0.001130899831   0.005380894359   0.003514909637   0.005324349368 
         Windsor    Windsor Locks          Wolcott       Woodbridge 
  0.008587542591   0.004317483711   0.004704178492   0.002683151052 
        Woodbury        Woodstock 
  0.003044309385   0.002850961994 
View summary statistics of different property_types
connecticut %>%
  group_by(property_type) %>%
  summarize(mean_sale_price_type = mean(sale_price),
            median_sale_price_type = median(sale_price),
            sd_sale_price_type = sd(sale_price))
# A tibble: 5 × 4
  property_type mean_sale_price_type median_sale_pri… sd_sale_price_t…
  <chr>                        <dbl>            <dbl>            <dbl>
1 Condo                      238111.           177900          236357.
2 Residential                346711.           226000          645755.
3 Single Family              418460.           280000          602036.
4 Three Family               240900.           225000          157237.
5 Two Family                 243985.           205000          219448.
View distribution of sale price, check for outliers
connecticut %>%
  group_by(property_type) %>%
  ggplot(aes(x = property_type, y = sale_price)) +
  geom_violin(notch = TRUE) +
  labs(title = "Distribution of Sale Price for Property Types",
       x = "Property Type",
       y = "Sale Price") +
  theme_light()

View distribution filtering sale price for under $2,500,000
connecticut %>%
  filter(sale_price < 2500000) %>%
  group_by(property_type) %>%
  ggplot(aes(x = property_type, y = sale_price)) +
  geom_violin(notch = TRUE) +
  labs(title = "Distribution of Sale Price Under $2,500,000",
       x = "Property Type",
       y = "Sale Price") +
  theme_bw()

Create a bargraph of median sale price of each town in Connecticut
connecticut %>%
  group_by(town) %>%
  summarize(median_sale_price = median(sale_price)) %>%
  ggplot(aes(x = town, y = median_sale_price)) +
  geom_bar(stat = "identity") +
  labs(title = "Median Sale Price of Towns in Connecticut", 
       y = "Price", 
       x = "Town") +
  theme_classic()

Group by year and display median sale price
connecticut %>%
  group_by(year = lubridate::floor_date(sale_date, "year")) %>%
  summarize(median_yearly_sale_price = median(sale_price)) %>%
  arrange()
# A tibble: 20 × 2
   year       median_yearly_sale_price
   <date>                        <dbl>
 1 1999-01-01                   95000 
 2 2001-01-01                  425856.
 3 2003-01-01                  158900 
 4 2004-01-01                  286900 
 5 2005-01-01                  150000 
 6 2006-01-01                  260000 
 7 2007-01-01                  270000 
 8 2008-01-01                  240000 
 9 2009-01-01                  215000 
10 2010-01-01                  222100 
11 2011-01-01                  210000 
12 2012-01-01                  220000 
13 2013-01-01                  220500 
14 2014-01-01                  210000 
15 2015-01-01                  214000 
16 2016-01-01                  215000 
17 2017-01-01                  227000 
18 2018-01-01                  225000 
19 2019-01-01                  230000 
20 2020-01-01                  265000 
Group by year and arrange descending by year
connecticut %>%
  group_by(year = lubridate::floor_date(sale_date, "year")) %>%
  arrange(year)
# A tibble: 548,236 × 9
# Groups:   year [20]
   list_year sale_date  town   assessed_price sale_price property_type
       <dbl> <date>     <chr>           <dbl>      <dbl> <chr>        
 1      2017 1999-04-05 New L…          46690      95000 Residential  
 2      2017 2001-08-23 Shelt…         427910     558000 Residential  
 3      2017 2001-08-01 Shelt…         390810     635000 Residential  
 4      2006 2001-09-04 Newin…          64750      88000 Residential  
 5      2006 2001-10-27 Green…         573930     660000 Residential  
 6      2006 2001-09-25 Nauga…         183850     293711 Residential  
 7      2006 2001-09-05 Newin…         166110     256000 Residential  
 8      2006 2003-07-17 Nauga…          84200     158900 Residential  
 9      2006 2004-07-31 Newin…         246120     420000 Residential  
10      2006 2004-06-05 Brist…         147320     329900 Residential  
# … with 548,226 more rows, and 3 more variables:
#   residential_type <chr>, location <chr>, year <date>
Create an object of the towns with bottom ten median sale prices
median_sale_price_bottom_10 <-connecticut %>%
  group_by(town) %>%
  summarize(median_sale_price = median(sale_price)) %>%
  arrange(median_sale_price) %>%
  slice(1:10)
Create an object of the towns with the top ten median sale prices
median_sale_price_top_10 <-connecticut %>%
  group_by(town) %>%
  summarize(median_sale_price = median(sale_price)) %>%
  arrange(desc(median_sale_price)) %>%
  slice(1:10)
View top and bottom 10 towns
median_sale_price_bottom_10
# A tibble: 10 × 2
   town          median_sale_price
   <chr>                     <dbl>
 1 Waterbury                 94300
 2 Norwich                  126140
 3 Torrington               128500
 4 Hartford                 130000
 5 Windham                  130000
 6 New Britain              134000
 7 East Hartford            140000
 8 Winchester               141000
 9 New London               145000
10 Bridgeport               146000
median_sale_price_top_10
# A tibble: 10 × 2
   town       median_sale_price
   <chr>                  <dbl>
 1 Greenwich            1335000
 2 Darien               1300000
 3 New Canaan           1242591
 4 Westport             1100000
 5 Weston                799000
 6 Wilton                741000
 7 Ridgefield            614625
 8 Roxbury               600000
 9 Easton                577517
10 Redding               541975
Create Vector of top 10 towns
top_10 <- median_sale_price_top_10$town
Create a line graph of the top ten towns with the highest median sale prices
connecticut %>%
  group_by(town, year = lubridate::floor_date(sale_date, "year")) %>%
  summarize(median_yearly_sale_price = median(sale_price)) %>%
  filter(town %in% top_10) %>%
  ggplot(aes(x = year, y = median_yearly_sale_price)) +
  geom_line(aes(color = town)) +
  ggtitle("Towns in Connecticut with the Highest Median Sale Price: Top 10") +
  ylab("Sale Price") +
  xlab("Year") +
  theme_classic()

Same as above only facet wrapped
connecticut %>%
  group_by(town, year = lubridate::floor_date(sale_date, "year")) %>%
  summarize(median_yearly_sale_price = median(sale_price)) %>%
  filter(town %in% top_10) %>%
  ggplot(aes(x = year, y = median_yearly_sale_price)) +
  geom_line(aes(color = town)) +
  ggtitle("Towns With Highest Median Sale Price: Top 10") +
  ylab("Sale Price") +
  xlab("Year") +
  theme_classic() +
  facet_wrap(vars(town))

Homework 5

Create scatterplot of mean_monthly_sale_price grouped By property_type
connecticut %>%
  group_by(property_type, month = lubridate::floor_date(sale_date, "month")) %>%
  summarize(mean_monthy_sale_price = mean(sale_price)) %>%
  filter(mean_monthy_sale_price < 5000000) %>%
  ggplot(aes(x = month, y = mean_monthy_sale_price)) +
  geom_point((aes(color = property_type))) +
  ggtitle("Mean Monthly Sale Price of Property Types") +
  ylab("Mean Sale Price") +
  xlab("Time") +
  theme_linedraw()

Create new object connecticut2 with only residential property_types with a sale_price Under $2,500,000
connecticut2 <- connecticut %>%
  filter(property_type == "Residential") %>%
   filter(sale_price < 2500000) %>%
   select("list_year",
         "sale_date",
         "town",
         "assessed_price",
         "sale_price",
         "location") 
colnames(connecticut2)
[1] "list_year"      "sale_date"      "town"          
[4] "assessed_price" "sale_price"     "location"      
head(connecticut2)
# A tibble: 6 × 6
  list_year sale_date  town         assessed_price sale_price location
      <dbl> <date>     <chr>                 <dbl>      <dbl> <chr>   
1      2014 2015-08-06 Stamford             503270     850000 POINT (…
2      2014 2015-06-29 New Haven             86030     149900 POINT (…
3      2014 2015-07-01 Ridgefield           351880     570000 POINT (…
4      2014 2015-04-30 New Britain          204680     261000 POINT (…
5      2014 2015-06-26 Stamford             229330     250000 POINT (…
6      2014 2015-06-29 West Hartfo…         356300     600000 POINT (…
Download and view Connecticut map
con_map <- map_data("county", 
         "connecticut")
ggplot() +
  geom_polygon(data = con_map,
               aes(x = long, y = lat, group = group),
               color = "black", fill = "slateblue1")

View the counts of the subregion column
head(con_map, n = 10)
        long      lat group order      region subregion
1  -73.53341 41.67695     1     1 connecticut fairfield
2  -73.49902 41.66549     1     2 connecticut fairfield
3  -73.49902 41.65403     1     3 connecticut fairfield
4  -73.50475 41.63684     1     4 connecticut fairfield
5  -73.49329 41.60819     1     5 connecticut fairfield
6  -73.46465 41.55090     1     6 connecticut fairfield
7  -73.44746 41.49934     1     7 connecticut fairfield
8  -73.39017 41.51079     1     8 connecticut fairfield
9  -73.37297 41.49360     1     9 connecticut fairfield
10 -73.32714 41.48214     1    10 connecticut fairfield
con_map %>%
  count(subregion)
   subregion  n
1  fairfield 51
2   hartford 57
3 litchfield 32
4  middlesex 36
5  new haven 70
6 new london 46
7    tolland 34
8    windham 20
Remove the word “POINT” from location Column
connecticut2 <- connecticut2%>%
    mutate_at("location", 
              str_replace, 
              "POINT", 
              "") %>%
    mutate_at("location", 
              str_replace, 
              "[ ]", 
              "") %>%
    mutate_at("location", 
              str_replace, 
              "[)]", 
              "")
head(connecticut2, n = 75)
# A tibble: 75 × 6
   list_year sale_date  town        assessed_price sale_price location
       <dbl> <date>     <chr>                <dbl>      <dbl> <chr>   
 1      2014 2015-08-06 Stamford            503270     850000 (-73.52…
 2      2014 2015-06-29 New Haven            86030     149900 (-72.87…
 3      2014 2015-07-01 Ridgefield          351880     570000 (-73.50…
 4      2014 2015-04-30 New Britain         204680     261000 (-72.77…
 5      2014 2015-06-26 Stamford            229330     250000 (-73.51…
 6      2014 2015-06-29 West Hartf…         356300     600000 (-72.76…
 7      2014 2014-10-17 Willington          236060     332900 (-72.22…
 8      2014 2015-08-05 Trumbull            183900     240000 (-73.21…
 9      2014 2015-09-01 Waterford            89610      75500 <NA>    
10      2014 2014-12-02 Waterbury            75840      91000 (-73.04…
# … with 65 more rows
Remove NAs from connecticut2
connecticut2 <- 
connecticut2 %>%
  na.omit(location) 
head(connecticut2, n = 25)
# A tibble: 25 × 6
   list_year sale_date  town        assessed_price sale_price location
       <dbl> <date>     <chr>                <dbl>      <dbl> <chr>   
 1      2014 2015-08-06 Stamford            503270     850000 (-73.52…
 2      2014 2015-06-29 New Haven            86030     149900 (-72.87…
 3      2014 2015-07-01 Ridgefield          351880     570000 (-73.50…
 4      2014 2015-04-30 New Britain         204680     261000 (-72.77…
 5      2014 2015-06-26 Stamford            229330     250000 (-73.51…
 6      2014 2015-06-29 West Hartf…         356300     600000 (-72.76…
 7      2014 2014-10-17 Willington          236060     332900 (-72.22…
 8      2014 2015-08-05 Trumbull            183900     240000 (-73.21…
 9      2014 2014-12-02 Waterbury            75840      91000 (-73.04…
10      2014 2015-07-01 Trumbull            178500     175000 (-73.22…
# … with 15 more rows
Seperate location into two columns: lat and ’long`
connecticut2 <- 
  connecticut2 %>%
  separate(location, c("long","lat"), " ")
head(connecticut2)
# A tibble: 6 × 7
  list_year sale_date  town      assessed_price sale_price long  lat  
      <dbl> <date>     <chr>              <dbl>      <dbl> <chr> <chr>
1      2014 2015-08-06 Stamford          503270     850000 (-73… 41.0…
2      2014 2015-06-29 New Haven          86030     149900 (-72… 41.3…
3      2014 2015-07-01 Ridgefie…         351880     570000 (-73… 41.2…
4      2014 2015-04-30 New Brit…         204680     261000 (-72… 41.7…
5      2014 2015-06-26 Stamford          229330     250000 (-73… 41.0…
6      2014 2015-06-29 West Har…         356300     600000 (-72… 41.7…
Remove remaining symbols from long column
connecticut2 <-
connecticut2 %>%
  mutate_at("long", str_replace, "[(]", "") %>%
  mutate_at("long", str_replace, "[ ]", "")
View connecticut2: first, last 100 rows
head(connecticut2, n = 100)
# A tibble: 100 × 7
   list_year sale_date  town     assessed_price sale_price long  lat  
       <dbl> <date>     <chr>             <dbl>      <dbl> <chr> <chr>
 1      2014 2015-08-06 Stamford         503270     850000 -73.… 41.0…
 2      2014 2015-06-29 New Hav…          86030     149900 -72.… 41.3…
 3      2014 2015-07-01 Ridgefi…         351880     570000 -73.… 41.2…
 4      2014 2015-04-30 New Bri…         204680     261000 -72.… 41.7…
 5      2014 2015-06-26 Stamford         229330     250000 -73.… 41.0…
 6      2014 2015-06-29 West Ha…         356300     600000 -72.… 41.7…
 7      2014 2014-10-17 Willing…         236060     332900 -72.… 41.8…
 8      2014 2015-08-05 Trumbull         183900     240000 -73.… 41.2…
 9      2014 2014-12-02 Waterbu…          75840      91000 -73.… 41.5…
10      2014 2015-07-01 Trumbull         178500     175000 -73.… 41.2…
# … with 90 more rows
tail(connecticut2, n = 100)
# A tibble: 100 × 7
   list_year sale_date  town     assessed_price sale_price long  lat  
       <dbl> <date>     <chr>             <dbl>      <dbl> <chr> <chr>
 1      2014 2015-09-28 Bristol          257600     387000 -72.… 41.7…
 2      2014 2015-02-23 Glaston…         179900     252500 -72.… 41.7…
 3      2014 2014-12-15 Danbury          238800     431000 -73.… 41.4…
 4      2014 2015-02-09 Berlin           191900     317000 -72.… 41.6…
 5      2014 2014-10-01 Madison          172400     315000 -72.… 41.2…
 6      2014 2015-02-26 East Wi…          22540      39000 -72.… 41.9…
 7      2014 2015-04-06 Bridgep…          85950      55000 -73.… 41.2…
 8      2014 2015-09-02 Fairfie…         436660     530000 -73.… 41.1…
 9      2014 2015-05-26 Milford          180730     309000 -73.… 41.2…
10      2014 2014-11-06 Fairfie…         239190     385000 -73.… 41.1…
# … with 90 more rows
Convert lat and long to numeric variables
connecticut2$long <- as.numeric(connecticut2$`long`)
connecticut2$lat <- as.numeric(connecticut2$`lat`)
connecticut2
# A tibble: 379,435 × 7
   list_year sale_date  town     assessed_price sale_price  long   lat
       <dbl> <date>     <chr>             <dbl>      <dbl> <dbl> <dbl>
 1      2014 2015-08-06 Stamford         503270     850000 -73.5  41.1
 2      2014 2015-06-29 New Hav…          86030     149900 -72.9  41.3
 3      2014 2015-07-01 Ridgefi…         351880     570000 -73.5  41.3
 4      2014 2015-04-30 New Bri…         204680     261000 -72.8  41.7
 5      2014 2015-06-26 Stamford         229330     250000 -73.5  41.1
 6      2014 2015-06-29 West Ha…         356300     600000 -72.8  41.8
 7      2014 2014-10-17 Willing…         236060     332900 -72.2  41.8
 8      2014 2015-08-05 Trumbull         183900     240000 -73.2  41.2
 9      2014 2014-12-02 Waterbu…          75840      91000 -73.0  41.5
10      2014 2015-07-01 Trumbull         178500     175000 -73.2  41.2
# … with 379,425 more rows
Attempt and FAIL to merge con_map and connecticut2 at the lat column
merged_con2 <-
  left_join(connecticut2, con_map, by = "lat")
merged_con2
# A tibble: 379,435 × 12
   list_year sale_date  town    assessed_price sale_price long.x   lat
       <dbl> <date>     <chr>            <dbl>      <dbl>  <dbl> <dbl>
 1      2014 2015-08-06 Stamfo…         503270     850000  -73.5  41.1
 2      2014 2015-06-29 New Ha…          86030     149900  -72.9  41.3
 3      2014 2015-07-01 Ridgef…         351880     570000  -73.5  41.3
 4      2014 2015-04-30 New Br…         204680     261000  -72.8  41.7
 5      2014 2015-06-26 Stamfo…         229330     250000  -73.5  41.1
 6      2014 2015-06-29 West H…         356300     600000  -72.8  41.8
 7      2014 2014-10-17 Willin…         236060     332900  -72.2  41.8
 8      2014 2015-08-05 Trumbu…         183900     240000  -73.2  41.2
 9      2014 2014-12-02 Waterb…          75840      91000  -73.0  41.5
10      2014 2015-07-01 Trumbu…         178500     175000  -73.2  41.2
# … with 379,425 more rows, and 5 more variables: long.y <dbl>,
#   group <dbl>, order <int>, region <chr>, subregion <chr>
Remove certian latitudes and longitudes from connecticut2 that were nonsensical
connecticut2 <- connecticut2 %>%
  filter(lat > 40 & lat < 42.5) %>% 
  filter(long > -74 & long < -72.5)
Arrange lat column in descending order
connecticut2 %>%
  arrange(desc(lat, n = 25))
# A tibble: 321,428 × 7
   list_year sale_date  town     assessed_price sale_price  long   lat
       <dbl> <date>     <chr>             <dbl>      <dbl> <dbl> <dbl>
 1      2006 2007-04-24 Salisbu…         307020     405000 -73.4  42.0
 2      2009 2009-12-04 Salisbu…         569280     705000 -73.4  42.0
 3      2011 2012-05-25 Salisbu…         375800     502000 -73.4  42.0
 4      2011 2012-04-30 Salisbu…         375000     325000 -73.4  42.0
 5      2010 2010-12-20 North C…         273070     282000 -73.3  42.0
 6      2013 2014-02-27 North C…         153630      80000 -73.3  42.0
 7      2008 2008-10-29 North C…         129600     169000 -73.3  42.0
 8      2006 2007-06-28 North C…          47180      88000 -73.3  42.0
 9      2011 2012-01-05 North C…          75170      75000 -73.3  42.0
10      2017 2018-07-24 Salisbu…         547300     920000 -73.4  42.0
# … with 321,418 more rows
Map Failure
##  ggplot() +
##    geom_polygon(data = con3,
##             aes(x = long, 
##                 y = lat, 
##                 group = town),
##             color = "black", fill = "slateblue1") 
Create of object of mean sale price for each town in 2008
mean_2008 <- connecticut2 %>%
  group_by(town, year = lubridate::floor_date(sale_date, "year")) %>%
  select(year, 
         sale_price, 
         town) %>%
  summarize(mean_3 = mean(sale_price)) %>%
  filter(year == "2008-01-01")
Create of object of mean sale price for each town in 2018
mean_2018 <- connecticut2 %>%
  group_by(town, year = lubridate::floor_date(sale_date, "year")) %>%
  select(year, 
         sale_price, 
         town) %>%
  summarize(mean_3 = mean(sale_price)) %>%
  filter(year == "2018-01-01")
View objects mean_2008 and mean_2018
mean_2008
# A tibble: 120 × 3
# Groups:   town [120]
   town         year        mean_3
   <chr>        <date>       <dbl>
 1 Ansonia      2008-01-01 221402.
 2 Avon         2008-01-01 485537.
 3 Barkhamsted  2008-01-01 274033.
 4 Beacon Falls 2008-01-01 312228.
 5 Berlin       2008-01-01 283870.
 6 Bethany      2008-01-01 436891.
 7 Bethel       2008-01-01 296576.
 8 Bethlehem    2008-01-01 288259.
 9 Bloomfield   2008-01-01 271202.
10 Branford     2008-01-01 315394.
# … with 110 more rows
mean_2018
# A tibble: 122 × 3
# Groups:   town [122]
   town         year        mean_3
   <chr>        <date>       <dbl>
 1 Ansonia      2018-01-01 193599.
 2 Avon         2018-01-01 389546.
 3 Barkhamsted  2018-01-01 301730.
 4 Beacon Falls 2018-01-01 214421.
 5 Berlin       2018-01-01 183557 
 6 Bethany      2018-01-01 358050 
 7 Bethel       2018-01-01 325866.
 8 Bethlehem    2018-01-01 302177.
 9 Bloomfield   2018-01-01 210893.
10 Branford     2018-01-01 292809.
# … with 112 more rows
Jion mean_2008 and mean_2018 and create new column Delta_mean
combined <- mean_2008 %>%
  left_join(mean_2018, by = "town") %>%
  mutate(delta_mean = mean_3.y - mean_3.x) 
head(combined, n = 25)
# A tibble: 25 × 6
# Groups:   town [25]
   town         year.x     mean_3.x year.y     mean_3.y delta_mean
   <chr>        <date>        <dbl> <date>        <dbl>      <dbl>
 1 Ansonia      2008-01-01  221402. 2018-01-01  193599.    -27803.
 2 Avon         2008-01-01  485537. 2018-01-01  389546.    -95991.
 3 Barkhamsted  2008-01-01  274033. 2018-01-01  301730.     27696.
 4 Beacon Falls 2008-01-01  312228. 2018-01-01  214421.    -97807.
 5 Berlin       2008-01-01  283870. 2018-01-01  183557    -100313.
 6 Bethany      2008-01-01  436891. 2018-01-01  358050     -78841.
 7 Bethel       2008-01-01  296576. 2018-01-01  325866.     29290.
 8 Bethlehem    2008-01-01  288259. 2018-01-01  302177.     13918.
 9 Bloomfield   2008-01-01  271202. 2018-01-01  210893.    -60309.
10 Branford     2008-01-01  315394. 2018-01-01  292809.    -22585.
# … with 15 more rows
combined %>%
  arrange(delta_mean) 
# A tibble: 120 × 6
# Groups:   town [120]
   town        year.x     mean_3.x year.y     mean_3.y delta_mean
   <chr>       <date>        <dbl> <date>        <dbl>      <dbl>
 1 Weston      2008-01-01 1039698. 2018-01-01  766908.   -272790.
 2 Washington  2008-01-01  707150  2018-01-01  450570.   -256580.
 3 Bridgewater 2008-01-01  662900  2018-01-01  461750    -201150 
 4 Easton      2008-01-01  741237. 2018-01-01  566425.   -174813.
 5 Canaan      2008-01-01  360000  2018-01-01  186514.   -173486.
 6 New Canaan  2008-01-01 1311972. 2018-01-01 1184363.   -127610.
 7 Westport    2008-01-01 1211626. 2018-01-01 1101054.   -110572.
 8 Oxford      2008-01-01  413123. 2018-01-01  304140.   -108983.
 9 Berlin      2008-01-01  283870. 2018-01-01  183557    -100313.
10 Haddam      2008-01-01  354452. 2018-01-01  256267.    -98184.
# … with 110 more rows

Reuse

Text and figures are licensed under Creative Commons Attribution CC BY-NC 4.0. The figures that have been reused from other sources don't fall under this license and can be recognized by a note in their caption: "Figure from ...".

Citation

For attribution, please cite this work as

Farrell (2022, March 6). Data Analytics and Computational Social Science: Homework 5: Incomplete. Retrieved from https://github.com/DACSS/dacss_course_website/posts/httpsrpubscomjnfarrell211873035/

BibTeX citation

@misc{farrell2022homework,
  author = {Farrell, Joseph},
  title = {Data Analytics and Computational Social Science: Homework 5: Incomplete},
  url = {https://github.com/DACSS/dacss_course_website/posts/httpsrpubscomjnfarrell211873035/},
  year = {2022}
}