Homework 4

Connecticut Estate Sales Data

Joseph Farrell
2022-02-25
Load Libraries
Read in data
connecticut <- read_csv("/Users/nelsonfarrell/Downloads/Real_Estate_Sales_2001-2019_GL.csv")
Check variables of connecticut
str(connecticut)
spec_tbl_df [930,621 × 14] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
 $ Serial Number   : num [1:930621] 141466 140604 14340 140455 141195 ...
 $ List Year       : num [1:930621] 2014 2014 2014 2014 2014 ...
 $ Date Recorded   : chr [1:930621] "08/06/2015" "06/29/2015" "07/01/2015" "04/30/2015" ...
 $ Town            : chr [1:930621] "Stamford" "New Haven" "Ridgefield" "New Britain" ...
 $ Address         : chr [1:930621] "83 OVERBROOK DRIVE" "56 HIGHVIEW LANE" "32 OVERLOOK DR" "171 BRADFORD WALK" ...
 $ Assessed Value  : num [1:930621] 503270 86030 351880 204680 229330 ...
 $ Sale Amount     : num [1:930621] 850000 149900 570000 261000 250000 ...
 $ Sales Ratio     : num [1:930621] 0.592 0.574 0.617 0.784 0.917 ...
 $ Property Type   : chr [1:930621] "Residential" "Residential" "Residential" "Residential" ...
 $ Residential Type: chr [1:930621] "Single Family" "Single Family" "Single Family" "Condo" ...
 $ Non Use Code    : chr [1:930621] NA NA NA NA ...
 $ Assessor Remarks: chr [1:930621] NA NA NA NA ...
 $ OPM remarks     : chr [1:930621] NA NA NA NA ...
 $ Location        : chr [1:930621] "POINT (-73.525969 41.081897)" "POINT (-72.878115 41.30285)" "POINT (-73.508273 41.286223)" "POINT (-72.775592 41.713335)" ...
 - attr(*, "spec")=
  .. cols(
  ..   `Serial Number` = col_double(),
  ..   `List Year` = col_double(),
  ..   `Date Recorded` = col_character(),
  ..   Town = col_character(),
  ..   Address = col_character(),
  ..   `Assessed Value` = col_double(),
  ..   `Sale Amount` = col_double(),
  ..   `Sales Ratio` = col_double(),
  ..   `Property Type` = col_character(),
  ..   `Residential Type` = col_character(),
  ..   `Non Use Code` = col_character(),
  ..   `Assessor Remarks` = col_character(),
  ..   `OPM remarks` = col_character(),
  ..   Location = col_character()
  .. )
 - attr(*, "problems")=<externalptr> 
Remove columns Serial number, Non Use Code, Assessor Remarks, Sales Ratio, and OPM Remarks
connecticut <- select(connecticut, 
                      "List Year", 
                      "Date Recorded", 
                      "Town", 
                      "Assessed Value", 
                      "Sale Amount", 
                      "Property Type", 
                      "Residential Type", 
                      "Location")
head(connecticut) #view first 6 rows of `connecticut`
# A tibble: 6 × 8
  `List Year` `Date Recorded` Town      `Assessed Value` `Sale Amount`
        <dbl> <chr>           <chr>                <dbl>         <dbl>
1        2014 08/06/2015      Stamford            503270        850000
2        2014 06/29/2015      New Haven            86030        149900
3        2014 07/01/2015      Ridgefie…           351880        570000
4        2014 04/30/2015      New Brit…           204680        261000
5        2014 06/26/2015      Stamford            229330        250000
6        2014 06/29/2015      West Har…           356300        600000
# … with 3 more variables: `Property Type` <chr>,
#   `Residential Type` <chr>, Location <chr>
Rename colunms without spaces and for clarity
connecticut <- rename(connecticut, 
                      "list_year" = "List Year", 
                      "sale_date" = "Date Recorded", 
                      "assessed_price" = "Assessed Value", 
                      "sale_price" = "Sale Amount", 
                      "property_type" = "Property Type", 
                      "residential_type" = "Residential Type", 
                      "location" = "Location", 
                      "town" = "Town")

List and description of remaining variables

Potential Research Questions

Explore Data
head(connecticut)
# A tibble: 6 × 8
  list_year sale_date  town    assessed_price sale_price property_type
      <dbl> <chr>      <chr>            <dbl>      <dbl> <chr>        
1      2014 08/06/2015 Stamfo…         503270     850000 Residential  
2      2014 06/29/2015 New Ha…          86030     149900 Residential  
3      2014 07/01/2015 Ridgef…         351880     570000 Residential  
4      2014 04/30/2015 New Br…         204680     261000 Residential  
5      2014 06/26/2015 Stamfo…         229330     250000 Residential  
6      2014 06/29/2015 West H…         356300     600000 Residential  
# … with 2 more variables: residential_type <chr>, location <chr>
tail(connecticut)
# A tibble: 6 × 8
  list_year sale_date  town    assessed_price sale_price property_type
      <dbl> <chr>      <chr>            <dbl>      <dbl> <chr>        
1      2014 02/19/2015 Hamden          220080     402500 Residential  
2      2014 12/18/2014 Danbury         248500     428000 Residential  
3      2014 11/20/2014 Naugat…         175910     252500 Residential  
4      2014 01/21/2015 Bridge…         723600     466667 Commercial   
5      2014 06/18/2015 Madison         409200     580000 Residential  
6      2014 04/22/2015 Cheshi…         379100     520000 Residential  
# … with 2 more variables: residential_type <chr>, location <chr>
Convert sale_date from character to date
connecticut$sale_date <- as.Date(connecticut$sale_date, 
                                 format = "%m/%d/%Y" )
str(connecticut)
tibble [930,621 × 8] (S3: tbl_df/tbl/data.frame)
 $ list_year       : num [1:930621] 2014 2014 2014 2014 2014 ...
 $ sale_date       : Date[1:930621], format: "2015-08-06" ...
 $ town            : chr [1:930621] "Stamford" "New Haven" "Ridgefield" "New Britain" ...
 $ assessed_price  : num [1:930621] 503270 86030 351880 204680 229330 ...
 $ sale_price      : num [1:930621] 850000 149900 570000 261000 250000 ...
 $ property_type   : chr [1:930621] "Residential" "Residential" "Residential" "Residential" ...
 $ residential_type: chr [1:930621] "Single Family" "Single Family" "Single Family" "Condo" ...
 $ location        : chr [1:930621] "POINT (-73.525969 41.081897)" "POINT (-72.878115 41.30285)" "POINT (-73.508273 41.286223)" "POINT (-72.775592 41.713335)" ...
Explore property_type variable
tally(~property_type, data = connecticut)
property_type
             A     Apartments              C          C/I/U 
          2210           2922              3          11913 
    Commercial          Condo    Four Family     Industrial 
         19404           9932            237           2551 
Public Utility              R    Residential  Single Family 
            66         290287         494147          40518 
  Three Family     Two Family             UA              V 
          1228           2411             11          24479 
   Vacant Land           <NA> 
         23414           4888 
Filter property_type for Residential, Single Family, Two Family, Three Family, Condo
connecticut <- connecticut %>%
  filter(`property_type` %in% 
           c("Residential", 
             "Single Family", 
             "Two Family", 
             "Three Family", 
             "Condo"))
Examine first and last 75 rows of connecticut
head(connecticut, n = 75)
# A tibble: 75 × 8
   list_year sale_date  town   assessed_price sale_price property_type
       <dbl> <date>     <chr>           <dbl>      <dbl> <chr>        
 1      2014 2015-08-06 Stamf…         503270     850000 Residential  
 2      2014 2015-06-29 New H…          86030     149900 Residential  
 3      2014 2015-07-01 Ridge…         351880     570000 Residential  
 4      2014 2015-04-30 New B…         204680     261000 Residential  
 5      2014 2015-06-26 Stamf…         229330     250000 Residential  
 6      2014 2015-06-29 West …         356300     600000 Residential  
 7      2014 2014-10-17 Willi…         236060     332900 Residential  
 8      2014 2015-08-05 Trumb…         183900     240000 Residential  
 9      2014 2015-09-01 Water…          89610      75500 Residential  
10      2014 2014-12-02 Water…          75840      91000 Residential  
# … with 65 more rows, and 2 more variables: residential_type <chr>,
#   location <chr>
tail(connecticut, n = 75)
# A tibble: 75 × 8
   list_year sale_date  town   assessed_price sale_price property_type
       <dbl> <date>     <chr>           <dbl>      <dbl> <chr>        
 1      2014 2014-10-09 Groton         155610      83000 Residential  
 2      2014 2015-04-01 Manch…         112300     164800 Residential  
 3      2014 2015-07-30 Farmi…         449170     710000 Residential  
 4      2014 2015-04-28 Harwi…         146430     185000 Residential  
 5      2014 2015-05-19 Chesh…         266540     342000 Residential  
 6      2014 2015-09-21 Darien         549990    3395000 Residential  
 7      2014 2015-06-26 Hamden         148470     175000 Residential  
 8      2014 2015-07-21 Killi…          73850     124900 Residential  
 9      2014 2014-12-11 Brook…         240180     395000 Residential  
10      2014 2015-07-16 Merid…         114170     108500 Residential  
# … with 65 more rows, and 2 more variables: residential_type <chr>,
#   location <chr>
Get summary statistics of assessed_price and sale_price
summarize(connecticut, 
          mean_assessed_price = mean(assessed_price), 
          mean_sale_price = mean(sale_price),
          median_assessed_price = median(assessed_price),
          median_sale_price = median(sale_price),
          sd_assessed_price =sd(assessed_price),
          sd_sale_price = sd(sale_price))
# A tibble: 1 × 6
  mean_assessed_pri… mean_sale_price median_assessed… median_sale_pri…
               <dbl>           <dbl>            <dbl>            <dbl>
1            247960.         349358.           159755           230000
# … with 2 more variables: sd_assessed_price <dbl>,
#   sd_sale_price <dbl>
View favstats of assessed_price
favstats(~assessed_price, data = connecticut)
 min     Q1 median     Q3       max     mean      sd      n missing
   0 111000 159755 247270 881510000 247959.7 1292161 548236       0
View favstats of sale_price
options(scipen = 999) # remove scientific notation
favstats(~sale_price, data = connecticut)
 min     Q1 median     Q3       max     mean       sd      n missing
   0 147500 230000 360000 120000000 349357.8 636067.5 548236       0
Examine counts of property_type
table(select(connecticut, property_type))

        Condo   Residential Single Family  Three Family    Two Family 
         9932        494147         40518          1228          2411 
connecticut %>%
  count(property_type)
# A tibble: 5 × 2
  property_type      n
  <chr>          <int>
1 Condo           9932
2 Residential   494147
3 Single Family  40518
4 Three Family    1228
5 Two Family      2411
View proportions of property type
prop.table(table(select(connecticut, property_type)))

        Condo   Residential Single Family  Three Family    Two Family 
  0.018116286   0.901339934   0.073906128   0.002239911   0.004397741 
View counts of town variable
connecticut %>%
  count(town) %>%
  arrange(desc(n))
# A tibble: 170 × 2
   town              n
   <chr>         <int>
 1 Bridgeport    18239
 2 Waterbury     15533
 3 Stamford      14678
 4 Norwalk       12698
 5 Fairfield     11340
 6 West Hartford 10750
 7 Danbury       10588
 8 Greenwich     10559
 9 Hamden        10318
10 New Haven     10133
# … with 160 more rows
View proportions of town variable
options(scipen = 999) #remove scientific notation
prop.table(table(select(connecticut, town)))

   ***Unknown***          Andover          Ansonia          Ashford 
  0.000001824032   0.000835406650   0.004607504797   0.001194740951 
            Avon      Barkhamsted     Beacon Falls           Berlin 
  0.007234110857   0.000906543897   0.001935297937   0.005915335731 
         Bethany           Bethel        Bethlehem       Bloomfield 
  0.001324247222   0.005667267381   0.000793453914   0.005625314646 
          Bolton           Bozrah         Branford       Bridgeport 
  0.001406328661   0.000512552988   0.009603528407   0.033268519397 
     Bridgewater          Bristol       Brookfield         Brooklyn 
  0.000541737500   0.016241180805   0.005570593686   0.002657614604 
      Burlington           Canaan       Canterbury           Canton 
  0.003239480808   0.000383046717   0.001364375926   0.003553214309 
         Chaplin         Cheshire          Chester          Clinton 
  0.000516201052   0.008085933795   0.001008689688   0.005059864730 
      Colchester        Colebrook         Columbia         Cornwall 
  0.004158792929   0.000443239773   0.001455577525   0.000465128156 
        Coventry         Cromwell          Danbury           Darien 
  0.003947205218   0.004755251388   0.019312850670   0.007179389898 
      Deep River            Derby           Durham      East Granby 
  0.001459225589   0.003368987079   0.002115877104   0.002083044528 
     East Haddam     East Hampton    East Hartford       East Haven 
  0.003026069065   0.004981431354   0.014707169905   0.008452564224 
       East Lyme     East Windsor         Eastford           Easton 
  0.006807287373   0.003575102693   0.000390342845   0.002571885101 
       Ellington          Enfield            Essex        Fairfield 
  0.004799028156   0.011383783626   0.002588301388   0.020684522724 
      Farmington         Franklin      Glastonbury           Goshen 
  0.008974237372   0.000397638973   0.012649661824   0.001293238678 
          Granby        Greenwich         Griswold           Groton 
  0.003463836742   0.019259953743   0.003319738215   0.009545159384 
        Guilford           Haddam           Hamden          Hampton 
  0.007166621674   0.002570061069   0.018820362034   0.000454183965 
        Hartford         Hartland        Harwinton           Hebron 
  0.013371978491   0.000490664604   0.001384440278   0.002467915277 
            Kent        Killingly     Killingworth          Lebanon 
  0.001050642424   0.006103211026   0.001920705681   0.002103108880 
         Ledyard           Lisbon       Litchfield             Lyme 
  0.005395486615   0.001067058712   0.002497099789   0.000802574074 
         Madison       Manchester        Mansfield      Marlborough 
  0.006544626767   0.014645152817   0.004071239393   0.001906113426 
         Meriden       Middlebury      Middlefield       Middletown 
  0.017324655805   0.002394953998   0.001081650968   0.012003954501 
         Milford           Monroe        Montville           Morris 
  0.016611459299   0.006571987246   0.005152890361   0.000735084890 
       Naugatuck      New Britain       New Canaan    New Fairfield 
  0.008742585310   0.016055129543   0.007237758921   0.003974565698 
    New Hartford        New Haven       New London      New Milford 
  0.002104932912   0.018482916116   0.006210828913   0.009238722010 
       Newington          Newtown          Norfolk   North Branford 
  0.010900415150   0.007660934342   0.000567273948   0.003533149957 
    North Canaan      North Haven North Stonington          Norwalk 
  0.000777037626   0.007130141034   0.001482938005   0.023161558161 
         Norwich         Old Lyme     Old Saybrook           Orange 
  0.011091938508   0.003206648232   0.004394093055   0.003241304840 
          Oxford       Plainfield       Plainville         Plymouth 
  0.003976389730   0.004122312289   0.005346237752   0.003565982533 
         Pomfret         Portland          Preston         Prospect 
  0.001094419192   0.003296025799   0.001313303030   0.003166519528 
          Putnam          Redding       Ridgefield       Rocky Hill 
  0.002965876010   0.002455147053   0.008383251009   0.005731108501 
         Roxbury            Salem        Salisbury         Scotland 
  0.000811694234   0.001302358838   0.001568667508   0.000404935101 
         Seymour           Sharon          Shelton          Sherman 
  0.005255036152   0.001140019991   0.012691614560   0.001265878198 
        Simsbury           Somers    South Windsor        Southbury 
  0.007774024325   0.002783472811   0.008558358079   0.008458036320 
     Southington          Sprague         Stafford         Stamford 
  0.014865860688   0.000735084890   0.003462012710   0.026773141494 
        Sterling       Stonington        Stratford         Suffield 
  0.001130899831   0.006117803282   0.016883240064   0.004490766750 
       Thomaston         Thompson          Tolland       Torrington 
  0.002145061616   0.002754288299   0.003834115235   0.013375626555 
        Trumbull            Union           Vernon        Voluntown 
  0.010712539855   0.000184227231   0.008335826177   0.000795277946 
     Wallingford           Warren       Washington        Waterbury 
  0.015137641454   0.000474248316   0.001530362836   0.028332688842 
       Waterford        Watertown    West Hartford       West Haven 
  0.006809111405   0.005497632406   0.019608343852   0.013371978491 
       Westbrook           Weston         Westport     Wethersfield 
  0.002624782028   0.003347098695   0.010915007406   0.008211792002 
      Willington           Wilton       Winchester          Windham 
  0.001130899831   0.005380894359   0.003514909637   0.005324349368 
         Windsor    Windsor Locks          Wolcott       Woodbridge 
  0.008587542591   0.004317483711   0.004704178492   0.002683151052 
        Woodbury        Woodstock 
  0.003044309385   0.002850961994 
View summary statistics of different property_types
connecticut %>%
  group_by(property_type) %>%
  summarize(mean_sale_price_type = mean(sale_price),
            median_sale_price_type = median(sale_price),
            sd_sale_price_type = sd(sale_price))
# A tibble: 5 × 4
  property_type mean_sale_price_type median_sale_pri… sd_sale_price_t…
  <chr>                        <dbl>            <dbl>            <dbl>
1 Condo                      238111.           177900          236357.
2 Residential                346711.           226000          645755.
3 Single Family              418460.           280000          602036.
4 Three Family               240900.           225000          157237.
5 Two Family                 243985.           205000          219448.
View distribution of sale price, check for outliers
connecticut %>%
  group_by(property_type) %>%
  ggplot(aes(x = property_type, y = sale_price)) +
  geom_violin(notch = TRUE) +
  labs(title = "Distribution of Sale Price for Property Types",
       x = "Property Type",
       y = "Sale Price") +
  theme_light()

View distribution filtering sale price for under $2,500,000
connecticut %>%
  filter(`sale_price` < 2500000) %>%
  group_by(property_type) %>%
  ggplot(aes(x = property_type, y = sale_price)) +
  geom_violin(notch = TRUE) +
  labs(title = "Distribution of Sale Price Under $2,500,000",
       x = "Property Type",
       y = "Sale Price") +
  theme_bw()

Create a bargraph of median sale price of each town in Connecticut
connecticut %>%
  group_by(town) %>%
  summarize(median_sale_price = median(sale_price)) %>%
  ggplot(aes(x = town, y = median_sale_price)) +
  geom_bar(stat = "identity") +
  labs(title = "Median Sale Price of Towns in Connecticut", 
       y = "Price", 
       x = "Town") +
  theme_classic()

Group by year and display median sale price
connecticut %>%
  group_by(year = lubridate::floor_date(sale_date, "year")) %>%
  summarize(median_yearly_sale_price = median(sale_price)) %>%
  arrange()
# A tibble: 20 × 2
   year       median_yearly_sale_price
   <date>                        <dbl>
 1 1999-01-01                   95000 
 2 2001-01-01                  425856.
 3 2003-01-01                  158900 
 4 2004-01-01                  286900 
 5 2005-01-01                  150000 
 6 2006-01-01                  260000 
 7 2007-01-01                  270000 
 8 2008-01-01                  240000 
 9 2009-01-01                  215000 
10 2010-01-01                  222100 
11 2011-01-01                  210000 
12 2012-01-01                  220000 
13 2013-01-01                  220500 
14 2014-01-01                  210000 
15 2015-01-01                  214000 
16 2016-01-01                  215000 
17 2017-01-01                  227000 
18 2018-01-01                  225000 
19 2019-01-01                  230000 
20 2020-01-01                  265000 
Group by year and arrange descending by year
connecticut %>%
  group_by(year = lubridate::floor_date(sale_date, "year")) %>%
  arrange(year)
# A tibble: 548,236 × 9
# Groups:   year [20]
   list_year sale_date  town   assessed_price sale_price property_type
       <dbl> <date>     <chr>           <dbl>      <dbl> <chr>        
 1      2017 1999-04-05 New L…          46690      95000 Residential  
 2      2017 2001-08-23 Shelt…         427910     558000 Residential  
 3      2017 2001-08-01 Shelt…         390810     635000 Residential  
 4      2006 2001-09-04 Newin…          64750      88000 Residential  
 5      2006 2001-10-27 Green…         573930     660000 Residential  
 6      2006 2001-09-25 Nauga…         183850     293711 Residential  
 7      2006 2001-09-05 Newin…         166110     256000 Residential  
 8      2006 2003-07-17 Nauga…          84200     158900 Residential  
 9      2006 2004-07-31 Newin…         246120     420000 Residential  
10      2006 2004-06-05 Brist…         147320     329900 Residential  
# … with 548,226 more rows, and 3 more variables:
#   residential_type <chr>, location <chr>, year <date>
Create an object of the towns with bottom ten median sale prices
median_sale_price_bottom_10 <-connecticut %>%
  group_by(town) %>%
  summarize(median_sale_price = median(sale_price)) %>%
  arrange(median_sale_price) %>%
  slice(1:10)
Create an object of the towns with the top ten median sale prices
median_sale_price_top_10 <-connecticut %>%
  group_by(town) %>%
  summarize(median_sale_price = median(sale_price)) %>%
  arrange(desc(median_sale_price)) %>%
  slice(1:10)
View top and bottom 10 towns
median_sale_price_bottom_10
# A tibble: 10 × 2
   town          median_sale_price
   <chr>                     <dbl>
 1 Waterbury                 94300
 2 Norwich                  126140
 3 Torrington               128500
 4 Hartford                 130000
 5 Windham                  130000
 6 New Britain              134000
 7 East Hartford            140000
 8 Winchester               141000
 9 New London               145000
10 Bridgeport               146000
median_sale_price_top_10
# A tibble: 10 × 2
   town       median_sale_price
   <chr>                  <dbl>
 1 Greenwich            1335000
 2 Darien               1300000
 3 New Canaan           1242591
 4 Westport             1100000
 5 Weston                799000
 6 Wilton                741000
 7 Ridgefield            614625
 8 Roxbury               600000
 9 Easton                577517
10 Redding               541975
Create Vector of top 10 towns
top_10 <- median_sale_price_top_10$town
Create a line graph of the top ten towns with the highest median sale prices
connecticut %>%
  group_by(town, year = lubridate::floor_date(sale_date, "year")) %>%
  summarize(median_yearly_sale_price = median(sale_price)) %>%
  filter(town %in% top_10) %>%
  ggplot(aes(x = year, y = median_yearly_sale_price)) +
  geom_line(aes(color = town)) +
  ggtitle("Towns in Connecticut with the Highest Median Sale Price: Top 10") +
  ylab("Sale Price") +
  xlab("Year") +
  theme_classic()

Same as above only facet wrapped
connecticut %>%
  group_by(town, year = lubridate::floor_date(sale_date, "year")) %>%
  summarize(median_yearly_sale_price = median(sale_price)) %>%
  filter(town %in% top_10) %>%
  ggplot(aes(x = year, y = median_yearly_sale_price)) +
  geom_line(aes(color = town)) +
  ggtitle("Towns With Highest Median Sale Price: Top 10") +
  ylab("Sale Price") +
  xlab("Year") +
  theme_classic() +
  facet_wrap(vars(town))

Reuse

Text and figures are licensed under Creative Commons Attribution CC BY-NC 4.0. The figures that have been reused from other sources don't fall under this license and can be recognized by a note in their caption: "Figure from ...".

Citation

For attribution, please cite this work as

Farrell (2022, Feb. 27). Data Analytics and Computational Social Science: Homework 4. Retrieved from https://github.com/DACSS/dacss_course_website/posts/httpsrpubscomjnfarrell211870094/

BibTeX citation

@misc{farrell2022homework,
  author = {Farrell, Joseph},
  title = {Data Analytics and Computational Social Science: Homework 4},
  url = {https://github.com/DACSS/dacss_course_website/posts/httpsrpubscomjnfarrell211870094/},
  year = {2022}
}