Connecticut Estate Sales Data
connecticut <- read_csv("/Users/nelsonfarrell/Downloads/Real_Estate_Sales_2001-2019_GL.csv")
connecticut
str(connecticut)
spec_tbl_df [930,621 × 14] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
$ Serial Number : num [1:930621] 141466 140604 14340 140455 141195 ...
$ List Year : num [1:930621] 2014 2014 2014 2014 2014 ...
$ Date Recorded : chr [1:930621] "08/06/2015" "06/29/2015" "07/01/2015" "04/30/2015" ...
$ Town : chr [1:930621] "Stamford" "New Haven" "Ridgefield" "New Britain" ...
$ Address : chr [1:930621] "83 OVERBROOK DRIVE" "56 HIGHVIEW LANE" "32 OVERLOOK DR" "171 BRADFORD WALK" ...
$ Assessed Value : num [1:930621] 503270 86030 351880 204680 229330 ...
$ Sale Amount : num [1:930621] 850000 149900 570000 261000 250000 ...
$ Sales Ratio : num [1:930621] 0.592 0.574 0.617 0.784 0.917 ...
$ Property Type : chr [1:930621] "Residential" "Residential" "Residential" "Residential" ...
$ Residential Type: chr [1:930621] "Single Family" "Single Family" "Single Family" "Condo" ...
$ Non Use Code : chr [1:930621] NA NA NA NA ...
$ Assessor Remarks: chr [1:930621] NA NA NA NA ...
$ OPM remarks : chr [1:930621] NA NA NA NA ...
$ Location : chr [1:930621] "POINT (-73.525969 41.081897)" "POINT (-72.878115 41.30285)" "POINT (-73.508273 41.286223)" "POINT (-72.775592 41.713335)" ...
- attr(*, "spec")=
.. cols(
.. `Serial Number` = col_double(),
.. `List Year` = col_double(),
.. `Date Recorded` = col_character(),
.. Town = col_character(),
.. Address = col_character(),
.. `Assessed Value` = col_double(),
.. `Sale Amount` = col_double(),
.. `Sales Ratio` = col_double(),
.. `Property Type` = col_character(),
.. `Residential Type` = col_character(),
.. `Non Use Code` = col_character(),
.. `Assessor Remarks` = col_character(),
.. `OPM remarks` = col_character(),
.. Location = col_character()
.. )
- attr(*, "problems")=<externalptr>
Serial number
, Non Use Code
, Assessor Remarks
, Sales Ratio
, and OPM Remarks
connecticut <- select(connecticut,
"List Year",
"Date Recorded",
"Town",
"Assessed Value",
"Sale Amount",
"Property Type",
"Residential Type",
"Location")
head(connecticut) #view first 6 rows of `connecticut`
# A tibble: 6 × 8
`List Year` `Date Recorded` Town `Assessed Value` `Sale Amount`
<dbl> <chr> <chr> <dbl> <dbl>
1 2014 08/06/2015 Stamford 503270 850000
2 2014 06/29/2015 New Haven 86030 149900
3 2014 07/01/2015 Ridgefie… 351880 570000
4 2014 04/30/2015 New Brit… 204680 261000
5 2014 06/26/2015 Stamford 229330 250000
6 2014 06/29/2015 West Har… 356300 600000
# … with 3 more variables: `Property Type` <chr>,
# `Residential Type` <chr>, Location <chr>
connecticut <- rename(connecticut,
"list_year" = "List Year",
"sale_date" = "Date Recorded",
"assessed_price" = "Assessed Value",
"sale_price" = "Sale Amount",
"property_type" = "Property Type",
"residential_type" = "Residential Type",
"location" = "Location",
"town" = "Town")
List and description of remaining variables
list_year
: numeric, year the property was listedsale_date
: Date, date the property was soldassessed_price
: numeric, price used in tax evaluationsale_price
: numeric, price the property sold forproperty_type
: character, nominal, type of property (i.e., residential, commercial, etc.)residential_type
: character, nominal, if residential, type of residential (i.e., single family, two family, etc.)location
: character, nominal, location of property in lat./long. (maybe useful to create a map later)town
: character, nominal, town location propertyPotential Research Questions
head(connecticut)
# A tibble: 6 × 8
list_year sale_date town assessed_price sale_price property_type
<dbl> <chr> <chr> <dbl> <dbl> <chr>
1 2014 08/06/2015 Stamfo… 503270 850000 Residential
2 2014 06/29/2015 New Ha… 86030 149900 Residential
3 2014 07/01/2015 Ridgef… 351880 570000 Residential
4 2014 04/30/2015 New Br… 204680 261000 Residential
5 2014 06/26/2015 Stamfo… 229330 250000 Residential
6 2014 06/29/2015 West H… 356300 600000 Residential
# … with 2 more variables: residential_type <chr>, location <chr>
tail(connecticut)
# A tibble: 6 × 8
list_year sale_date town assessed_price sale_price property_type
<dbl> <chr> <chr> <dbl> <dbl> <chr>
1 2014 02/19/2015 Hamden 220080 402500 Residential
2 2014 12/18/2014 Danbury 248500 428000 Residential
3 2014 11/20/2014 Naugat… 175910 252500 Residential
4 2014 01/21/2015 Bridge… 723600 466667 Commercial
5 2014 06/18/2015 Madison 409200 580000 Residential
6 2014 04/22/2015 Cheshi… 379100 520000 Residential
# … with 2 more variables: residential_type <chr>, location <chr>
sale_date
from character to date
tibble [930,621 × 8] (S3: tbl_df/tbl/data.frame)
$ list_year : num [1:930621] 2014 2014 2014 2014 2014 ...
$ sale_date : Date[1:930621], format: "2015-08-06" ...
$ town : chr [1:930621] "Stamford" "New Haven" "Ridgefield" "New Britain" ...
$ assessed_price : num [1:930621] 503270 86030 351880 204680 229330 ...
$ sale_price : num [1:930621] 850000 149900 570000 261000 250000 ...
$ property_type : chr [1:930621] "Residential" "Residential" "Residential" "Residential" ...
$ residential_type: chr [1:930621] "Single Family" "Single Family" "Single Family" "Condo" ...
$ location : chr [1:930621] "POINT (-73.525969 41.081897)" "POINT (-72.878115 41.30285)" "POINT (-73.508273 41.286223)" "POINT (-72.775592 41.713335)" ...
property_type
variable
tally(~property_type, data = connecticut)
property_type
A Apartments C C/I/U
2210 2922 3 11913
Commercial Condo Four Family Industrial
19404 9932 237 2551
Public Utility R Residential Single Family
66 290287 494147 40518
Three Family Two Family UA V
1228 2411 11 24479
Vacant Land <NA>
23414 4888
property_type
for Residential
, Single Family
, Two Family
, Three Family
, Condo
connecticut
head(connecticut, n = 75)
# A tibble: 75 × 8
list_year sale_date town assessed_price sale_price property_type
<dbl> <date> <chr> <dbl> <dbl> <chr>
1 2014 2015-08-06 Stamf… 503270 850000 Residential
2 2014 2015-06-29 New H… 86030 149900 Residential
3 2014 2015-07-01 Ridge… 351880 570000 Residential
4 2014 2015-04-30 New B… 204680 261000 Residential
5 2014 2015-06-26 Stamf… 229330 250000 Residential
6 2014 2015-06-29 West … 356300 600000 Residential
7 2014 2014-10-17 Willi… 236060 332900 Residential
8 2014 2015-08-05 Trumb… 183900 240000 Residential
9 2014 2015-09-01 Water… 89610 75500 Residential
10 2014 2014-12-02 Water… 75840 91000 Residential
# … with 65 more rows, and 2 more variables: residential_type <chr>,
# location <chr>
tail(connecticut, n = 75)
# A tibble: 75 × 8
list_year sale_date town assessed_price sale_price property_type
<dbl> <date> <chr> <dbl> <dbl> <chr>
1 2014 2014-10-09 Groton 155610 83000 Residential
2 2014 2015-04-01 Manch… 112300 164800 Residential
3 2014 2015-07-30 Farmi… 449170 710000 Residential
4 2014 2015-04-28 Harwi… 146430 185000 Residential
5 2014 2015-05-19 Chesh… 266540 342000 Residential
6 2014 2015-09-21 Darien 549990 3395000 Residential
7 2014 2015-06-26 Hamden 148470 175000 Residential
8 2014 2015-07-21 Killi… 73850 124900 Residential
9 2014 2014-12-11 Brook… 240180 395000 Residential
10 2014 2015-07-16 Merid… 114170 108500 Residential
# … with 65 more rows, and 2 more variables: residential_type <chr>,
# location <chr>
assessed_price
and sale_price
summarize(connecticut,
mean_assessed_price = mean(assessed_price),
mean_sale_price = mean(sale_price),
median_assessed_price = median(assessed_price),
median_sale_price = median(sale_price),
sd_assessed_price =sd(assessed_price),
sd_sale_price = sd(sale_price))
# A tibble: 1 × 6
mean_assessed_pri… mean_sale_price median_assessed… median_sale_pri…
<dbl> <dbl> <dbl> <dbl>
1 247960. 349358. 159755 230000
# … with 2 more variables: sd_assessed_price <dbl>,
# sd_sale_price <dbl>
assessed_price
favstats(~assessed_price, data = connecticut)
min Q1 median Q3 max mean sd n missing
0 111000 159755 247270 881510000 247959.7 1292161 548236 0
sale_price
min Q1 median Q3 max mean sd n missing
0 147500 230000 360000 120000000 349357.8 636067.5 548236 0
property_type
Condo Residential Single Family Three Family Two Family
9932 494147 40518 1228 2411
# A tibble: 5 × 2
property_type n
<chr> <int>
1 Condo 9932
2 Residential 494147
3 Single Family 40518
4 Three Family 1228
5 Two Family 2411
property type
prop.table(table(select(connecticut, property_type)))
Condo Residential Single Family Three Family Two Family
0.018116286 0.901339934 0.073906128 0.002239911 0.004397741
town
variable
# A tibble: 170 × 2
town n
<chr> <int>
1 Bridgeport 18239
2 Waterbury 15533
3 Stamford 14678
4 Norwalk 12698
5 Fairfield 11340
6 West Hartford 10750
7 Danbury 10588
8 Greenwich 10559
9 Hamden 10318
10 New Haven 10133
# … with 160 more rows
town
variable
options(scipen = 999) #remove scientific notation
prop.table(table(select(connecticut, town)))
***Unknown*** Andover Ansonia Ashford
0.000001824032 0.000835406650 0.004607504797 0.001194740951
Avon Barkhamsted Beacon Falls Berlin
0.007234110857 0.000906543897 0.001935297937 0.005915335731
Bethany Bethel Bethlehem Bloomfield
0.001324247222 0.005667267381 0.000793453914 0.005625314646
Bolton Bozrah Branford Bridgeport
0.001406328661 0.000512552988 0.009603528407 0.033268519397
Bridgewater Bristol Brookfield Brooklyn
0.000541737500 0.016241180805 0.005570593686 0.002657614604
Burlington Canaan Canterbury Canton
0.003239480808 0.000383046717 0.001364375926 0.003553214309
Chaplin Cheshire Chester Clinton
0.000516201052 0.008085933795 0.001008689688 0.005059864730
Colchester Colebrook Columbia Cornwall
0.004158792929 0.000443239773 0.001455577525 0.000465128156
Coventry Cromwell Danbury Darien
0.003947205218 0.004755251388 0.019312850670 0.007179389898
Deep River Derby Durham East Granby
0.001459225589 0.003368987079 0.002115877104 0.002083044528
East Haddam East Hampton East Hartford East Haven
0.003026069065 0.004981431354 0.014707169905 0.008452564224
East Lyme East Windsor Eastford Easton
0.006807287373 0.003575102693 0.000390342845 0.002571885101
Ellington Enfield Essex Fairfield
0.004799028156 0.011383783626 0.002588301388 0.020684522724
Farmington Franklin Glastonbury Goshen
0.008974237372 0.000397638973 0.012649661824 0.001293238678
Granby Greenwich Griswold Groton
0.003463836742 0.019259953743 0.003319738215 0.009545159384
Guilford Haddam Hamden Hampton
0.007166621674 0.002570061069 0.018820362034 0.000454183965
Hartford Hartland Harwinton Hebron
0.013371978491 0.000490664604 0.001384440278 0.002467915277
Kent Killingly Killingworth Lebanon
0.001050642424 0.006103211026 0.001920705681 0.002103108880
Ledyard Lisbon Litchfield Lyme
0.005395486615 0.001067058712 0.002497099789 0.000802574074
Madison Manchester Mansfield Marlborough
0.006544626767 0.014645152817 0.004071239393 0.001906113426
Meriden Middlebury Middlefield Middletown
0.017324655805 0.002394953998 0.001081650968 0.012003954501
Milford Monroe Montville Morris
0.016611459299 0.006571987246 0.005152890361 0.000735084890
Naugatuck New Britain New Canaan New Fairfield
0.008742585310 0.016055129543 0.007237758921 0.003974565698
New Hartford New Haven New London New Milford
0.002104932912 0.018482916116 0.006210828913 0.009238722010
Newington Newtown Norfolk North Branford
0.010900415150 0.007660934342 0.000567273948 0.003533149957
North Canaan North Haven North Stonington Norwalk
0.000777037626 0.007130141034 0.001482938005 0.023161558161
Norwich Old Lyme Old Saybrook Orange
0.011091938508 0.003206648232 0.004394093055 0.003241304840
Oxford Plainfield Plainville Plymouth
0.003976389730 0.004122312289 0.005346237752 0.003565982533
Pomfret Portland Preston Prospect
0.001094419192 0.003296025799 0.001313303030 0.003166519528
Putnam Redding Ridgefield Rocky Hill
0.002965876010 0.002455147053 0.008383251009 0.005731108501
Roxbury Salem Salisbury Scotland
0.000811694234 0.001302358838 0.001568667508 0.000404935101
Seymour Sharon Shelton Sherman
0.005255036152 0.001140019991 0.012691614560 0.001265878198
Simsbury Somers South Windsor Southbury
0.007774024325 0.002783472811 0.008558358079 0.008458036320
Southington Sprague Stafford Stamford
0.014865860688 0.000735084890 0.003462012710 0.026773141494
Sterling Stonington Stratford Suffield
0.001130899831 0.006117803282 0.016883240064 0.004490766750
Thomaston Thompson Tolland Torrington
0.002145061616 0.002754288299 0.003834115235 0.013375626555
Trumbull Union Vernon Voluntown
0.010712539855 0.000184227231 0.008335826177 0.000795277946
Wallingford Warren Washington Waterbury
0.015137641454 0.000474248316 0.001530362836 0.028332688842
Waterford Watertown West Hartford West Haven
0.006809111405 0.005497632406 0.019608343852 0.013371978491
Westbrook Weston Westport Wethersfield
0.002624782028 0.003347098695 0.010915007406 0.008211792002
Willington Wilton Winchester Windham
0.001130899831 0.005380894359 0.003514909637 0.005324349368
Windsor Windsor Locks Wolcott Woodbridge
0.008587542591 0.004317483711 0.004704178492 0.002683151052
Woodbury Woodstock
0.003044309385 0.002850961994
property_types
connecticut %>%
group_by(property_type) %>%
summarize(mean_sale_price_type = mean(sale_price),
median_sale_price_type = median(sale_price),
sd_sale_price_type = sd(sale_price))
# A tibble: 5 × 4
property_type mean_sale_price_type median_sale_pri… sd_sale_price_t…
<chr> <dbl> <dbl> <dbl>
1 Condo 238111. 177900 236357.
2 Residential 346711. 226000 645755.
3 Single Family 418460. 280000 602036.
4 Three Family 240900. 225000 157237.
5 Two Family 243985. 205000 219448.
sale price
, check for outliers
connecticut %>%
group_by(property_type) %>%
ggplot(aes(x = property_type, y = sale_price)) +
geom_violin(notch = TRUE) +
labs(title = "Distribution of Sale Price for Property Types",
x = "Property Type",
y = "Sale Price") +
theme_light()
sale price
for under $2,500,000
connecticut %>%
filter(sale_price < 2500000) %>%
group_by(property_type) %>%
ggplot(aes(x = property_type, y = sale_price)) +
geom_violin(notch = TRUE) +
labs(title = "Distribution of Sale Price Under $2,500,000",
x = "Property Type",
y = "Sale Price") +
theme_bw()
sale price
of each town in Connecticut
connecticut %>%
group_by(town) %>%
summarize(median_sale_price = median(sale_price)) %>%
ggplot(aes(x = town, y = median_sale_price)) +
geom_bar(stat = "identity") +
labs(title = "Median Sale Price of Towns in Connecticut",
y = "Price",
x = "Town") +
theme_classic()
town
and median_sale_price
.connecticut %>%
group_by(year = lubridate::floor_date(sale_date, "year")) %>%
summarize(median_yearly_sale_price = median(sale_price)) %>%
arrange()
# A tibble: 20 × 2
year median_yearly_sale_price
<date> <dbl>
1 1999-01-01 95000
2 2001-01-01 425856.
3 2003-01-01 158900
4 2004-01-01 286900
5 2005-01-01 150000
6 2006-01-01 260000
7 2007-01-01 270000
8 2008-01-01 240000
9 2009-01-01 215000
10 2010-01-01 222100
11 2011-01-01 210000
12 2012-01-01 220000
13 2013-01-01 220500
14 2014-01-01 210000
15 2015-01-01 214000
16 2016-01-01 215000
17 2017-01-01 227000
18 2018-01-01 225000
19 2019-01-01 230000
20 2020-01-01 265000
year
connecticut %>%
group_by(year = lubridate::floor_date(sale_date, "year")) %>%
arrange(year)
# A tibble: 548,236 × 9
# Groups: year [20]
list_year sale_date town assessed_price sale_price property_type
<dbl> <date> <chr> <dbl> <dbl> <chr>
1 2017 1999-04-05 New L… 46690 95000 Residential
2 2017 2001-08-23 Shelt… 427910 558000 Residential
3 2017 2001-08-01 Shelt… 390810 635000 Residential
4 2006 2001-09-04 Newin… 64750 88000 Residential
5 2006 2001-10-27 Green… 573930 660000 Residential
6 2006 2001-09-25 Nauga… 183850 293711 Residential
7 2006 2001-09-05 Newin… 166110 256000 Residential
8 2006 2003-07-17 Nauga… 84200 158900 Residential
9 2006 2004-07-31 Newin… 246120 420000 Residential
10 2006 2004-06-05 Brist… 147320 329900 Residential
# … with 548,226 more rows, and 3 more variables:
# residential_type <chr>, location <chr>, year <date>
median_sale_price_bottom_10
# A tibble: 10 × 2
town median_sale_price
<chr> <dbl>
1 Waterbury 94300
2 Norwich 126140
3 Torrington 128500
4 Hartford 130000
5 Windham 130000
6 New Britain 134000
7 East Hartford 140000
8 Winchester 141000
9 New London 145000
10 Bridgeport 146000
median_sale_price_top_10
# A tibble: 10 × 2
town median_sale_price
<chr> <dbl>
1 Greenwich 1335000
2 Darien 1300000
3 New Canaan 1242591
4 Westport 1100000
5 Weston 799000
6 Wilton 741000
7 Ridgefield 614625
8 Roxbury 600000
9 Easton 577517
10 Redding 541975
top_10 <- median_sale_price_top_10$town
connecticut %>%
group_by(town, year = lubridate::floor_date(sale_date, "year")) %>%
summarize(median_yearly_sale_price = median(sale_price)) %>%
filter(town %in% top_10) %>%
ggplot(aes(x = year, y = median_yearly_sale_price)) +
geom_line(aes(color = town)) +
ggtitle("Towns in Connecticut with the Highest Median Sale Price: Top 10") +
ylab("Sale Price") +
xlab("Year") +
theme_classic()
median_yearly sale price
, year
, and town
.yearly_median_sale_price
going up the fastest.yearly_median_sale_price
is an aggregate of all sales in that year.property_type
first and look at each property type separately.connecticut %>%
group_by(town, year = lubridate::floor_date(sale_date, "year")) %>%
summarize(median_yearly_sale_price = median(sale_price)) %>%
filter(town %in% top_10) %>%
ggplot(aes(x = year, y = median_yearly_sale_price)) +
geom_line(aes(color = town)) +
ggtitle("Towns With Highest Median Sale Price: Top 10") +
ylab("Sale Price") +
xlab("Year") +
theme_classic() +
facet_wrap(vars(town))
mean_monthly_sale_price
grouped By property_type
connecticut %>%
group_by(property_type, month = lubridate::floor_date(sale_date, "month")) %>%
summarize(mean_monthy_sale_price = mean(sale_price)) %>%
filter(mean_monthy_sale_price < 5000000) %>%
ggplot(aes(x = month, y = mean_monthy_sale_price)) +
geom_point((aes(color = property_type))) +
ggtitle("Mean Monthly Sale Price of Property Types") +
ylab("Mean Sale Price") +
xlab("Time") +
theme_linedraw()
residential
goes back a number of years, so I will filter out all other property types.sale_price
for under $2,500,000connecticut2
with only residential
property_types
with a sale_price
Under $2,500,000
connecticut2 <- connecticut %>%
filter(property_type == "Residential") %>%
filter(sale_price < 2500000) %>%
select("list_year",
"sale_date",
"town",
"assessed_price",
"sale_price",
"location")
colnames(connecticut2)
[1] "list_year" "sale_date" "town"
[4] "assessed_price" "sale_price" "location"
head(connecticut2)
# A tibble: 6 × 6
list_year sale_date town assessed_price sale_price location
<dbl> <date> <chr> <dbl> <dbl> <chr>
1 2014 2015-08-06 Stamford 503270 850000 POINT (…
2 2014 2015-06-29 New Haven 86030 149900 POINT (…
3 2014 2015-07-01 Ridgefield 351880 570000 POINT (…
4 2014 2015-04-30 New Britain 204680 261000 POINT (…
5 2014 2015-06-26 Stamford 229330 250000 POINT (…
6 2014 2015-06-29 West Hartfo… 356300 600000 POINT (…
con_map <- map_data("county",
"connecticut")
ggplot() +
geom_polygon(data = con_map,
aes(x = long, y = lat, group = group),
color = "black", fill = "slateblue1")
subregion
column
head(con_map, n = 10)
long lat group order region subregion
1 -73.53341 41.67695 1 1 connecticut fairfield
2 -73.49902 41.66549 1 2 connecticut fairfield
3 -73.49902 41.65403 1 3 connecticut fairfield
4 -73.50475 41.63684 1 4 connecticut fairfield
5 -73.49329 41.60819 1 5 connecticut fairfield
6 -73.46465 41.55090 1 6 connecticut fairfield
7 -73.44746 41.49934 1 7 connecticut fairfield
8 -73.39017 41.51079 1 8 connecticut fairfield
9 -73.37297 41.49360 1 9 connecticut fairfield
10 -73.32714 41.48214 1 10 connecticut fairfield
subregion n
1 fairfield 51
2 hartford 57
3 litchfield 32
4 middlesex 36
5 new haven 70
6 new london 46
7 tolland 34
8 windham 20
location
Column
connecticut2 <- connecticut2%>%
mutate_at("location",
str_replace,
"POINT",
"") %>%
mutate_at("location",
str_replace,
"[ ]",
"") %>%
mutate_at("location",
str_replace,
"[)]",
"")
head(connecticut2, n = 75)
# A tibble: 75 × 6
list_year sale_date town assessed_price sale_price location
<dbl> <date> <chr> <dbl> <dbl> <chr>
1 2014 2015-08-06 Stamford 503270 850000 (-73.52…
2 2014 2015-06-29 New Haven 86030 149900 (-72.87…
3 2014 2015-07-01 Ridgefield 351880 570000 (-73.50…
4 2014 2015-04-30 New Britain 204680 261000 (-72.77…
5 2014 2015-06-26 Stamford 229330 250000 (-73.51…
6 2014 2015-06-29 West Hartf… 356300 600000 (-72.76…
7 2014 2014-10-17 Willington 236060 332900 (-72.22…
8 2014 2015-08-05 Trumbull 183900 240000 (-73.21…
9 2014 2015-09-01 Waterford 89610 75500 <NA>
10 2014 2014-12-02 Waterbury 75840 91000 (-73.04…
# … with 65 more rows
connecticut2
# A tibble: 25 × 6
list_year sale_date town assessed_price sale_price location
<dbl> <date> <chr> <dbl> <dbl> <chr>
1 2014 2015-08-06 Stamford 503270 850000 (-73.52…
2 2014 2015-06-29 New Haven 86030 149900 (-72.87…
3 2014 2015-07-01 Ridgefield 351880 570000 (-73.50…
4 2014 2015-04-30 New Britain 204680 261000 (-72.77…
5 2014 2015-06-26 Stamford 229330 250000 (-73.51…
6 2014 2015-06-29 West Hartf… 356300 600000 (-72.76…
7 2014 2014-10-17 Willington 236060 332900 (-72.22…
8 2014 2015-08-05 Trumbull 183900 240000 (-73.21…
9 2014 2014-12-02 Waterbury 75840 91000 (-73.04…
10 2014 2015-07-01 Trumbull 178500 175000 (-73.22…
# … with 15 more rows
location
into two columns: lat
and ’long`
# A tibble: 6 × 7
list_year sale_date town assessed_price sale_price long lat
<dbl> <date> <chr> <dbl> <dbl> <chr> <chr>
1 2014 2015-08-06 Stamford 503270 850000 (-73… 41.0…
2 2014 2015-06-29 New Haven 86030 149900 (-72… 41.3…
3 2014 2015-07-01 Ridgefie… 351880 570000 (-73… 41.2…
4 2014 2015-04-30 New Brit… 204680 261000 (-72… 41.7…
5 2014 2015-06-26 Stamford 229330 250000 (-73… 41.0…
6 2014 2015-06-29 West Har… 356300 600000 (-72… 41.7…
long
column
connecticut2
: first, last 100 rows
head(connecticut2, n = 100)
# A tibble: 100 × 7
list_year sale_date town assessed_price sale_price long lat
<dbl> <date> <chr> <dbl> <dbl> <chr> <chr>
1 2014 2015-08-06 Stamford 503270 850000 -73.… 41.0…
2 2014 2015-06-29 New Hav… 86030 149900 -72.… 41.3…
3 2014 2015-07-01 Ridgefi… 351880 570000 -73.… 41.2…
4 2014 2015-04-30 New Bri… 204680 261000 -72.… 41.7…
5 2014 2015-06-26 Stamford 229330 250000 -73.… 41.0…
6 2014 2015-06-29 West Ha… 356300 600000 -72.… 41.7…
7 2014 2014-10-17 Willing… 236060 332900 -72.… 41.8…
8 2014 2015-08-05 Trumbull 183900 240000 -73.… 41.2…
9 2014 2014-12-02 Waterbu… 75840 91000 -73.… 41.5…
10 2014 2015-07-01 Trumbull 178500 175000 -73.… 41.2…
# … with 90 more rows
tail(connecticut2, n = 100)
# A tibble: 100 × 7
list_year sale_date town assessed_price sale_price long lat
<dbl> <date> <chr> <dbl> <dbl> <chr> <chr>
1 2014 2015-09-28 Bristol 257600 387000 -72.… 41.7…
2 2014 2015-02-23 Glaston… 179900 252500 -72.… 41.7…
3 2014 2014-12-15 Danbury 238800 431000 -73.… 41.4…
4 2014 2015-02-09 Berlin 191900 317000 -72.… 41.6…
5 2014 2014-10-01 Madison 172400 315000 -72.… 41.2…
6 2014 2015-02-26 East Wi… 22540 39000 -72.… 41.9…
7 2014 2015-04-06 Bridgep… 85950 55000 -73.… 41.2…
8 2014 2015-09-02 Fairfie… 436660 530000 -73.… 41.1…
9 2014 2015-05-26 Milford 180730 309000 -73.… 41.2…
10 2014 2014-11-06 Fairfie… 239190 385000 -73.… 41.1…
# … with 90 more rows
lat
and long
to numeric variables
connecticut2$long <- as.numeric(connecticut2$`long`)
connecticut2$lat <- as.numeric(connecticut2$`lat`)
connecticut2
# A tibble: 379,435 × 7
list_year sale_date town assessed_price sale_price long lat
<dbl> <date> <chr> <dbl> <dbl> <dbl> <dbl>
1 2014 2015-08-06 Stamford 503270 850000 -73.5 41.1
2 2014 2015-06-29 New Hav… 86030 149900 -72.9 41.3
3 2014 2015-07-01 Ridgefi… 351880 570000 -73.5 41.3
4 2014 2015-04-30 New Bri… 204680 261000 -72.8 41.7
5 2014 2015-06-26 Stamford 229330 250000 -73.5 41.1
6 2014 2015-06-29 West Ha… 356300 600000 -72.8 41.8
7 2014 2014-10-17 Willing… 236060 332900 -72.2 41.8
8 2014 2015-08-05 Trumbull 183900 240000 -73.2 41.2
9 2014 2014-12-02 Waterbu… 75840 91000 -73.0 41.5
10 2014 2015-07-01 Trumbull 178500 175000 -73.2 41.2
# … with 379,425 more rows
con_map
and connecticut2
at the lat
column
merged_con2 <-
left_join(connecticut2, con_map, by = "lat")
merged_con2
# A tibble: 379,435 × 12
list_year sale_date town assessed_price sale_price long.x lat
<dbl> <date> <chr> <dbl> <dbl> <dbl> <dbl>
1 2014 2015-08-06 Stamfo… 503270 850000 -73.5 41.1
2 2014 2015-06-29 New Ha… 86030 149900 -72.9 41.3
3 2014 2015-07-01 Ridgef… 351880 570000 -73.5 41.3
4 2014 2015-04-30 New Br… 204680 261000 -72.8 41.7
5 2014 2015-06-26 Stamfo… 229330 250000 -73.5 41.1
6 2014 2015-06-29 West H… 356300 600000 -72.8 41.8
7 2014 2014-10-17 Willin… 236060 332900 -72.2 41.8
8 2014 2015-08-05 Trumbu… 183900 240000 -73.2 41.2
9 2014 2014-12-02 Waterb… 75840 91000 -73.0 41.5
10 2014 2015-07-01 Trumbu… 178500 175000 -73.2 41.2
# … with 379,425 more rows, and 5 more variables: long.y <dbl>,
# group <dbl>, order <int>, region <chr>, subregion <chr>
connecticut2
that were nonsensical
lat
column in descending order
# A tibble: 321,428 × 7
list_year sale_date town assessed_price sale_price long lat
<dbl> <date> <chr> <dbl> <dbl> <dbl> <dbl>
1 2006 2007-04-24 Salisbu… 307020 405000 -73.4 42.0
2 2009 2009-12-04 Salisbu… 569280 705000 -73.4 42.0
3 2011 2012-05-25 Salisbu… 375800 502000 -73.4 42.0
4 2011 2012-04-30 Salisbu… 375000 325000 -73.4 42.0
5 2010 2010-12-20 North C… 273070 282000 -73.3 42.0
6 2013 2014-02-27 North C… 153630 80000 -73.3 42.0
7 2008 2008-10-29 North C… 129600 169000 -73.3 42.0
8 2006 2007-06-28 North C… 47180 88000 -73.3 42.0
9 2011 2012-01-05 North C… 75170 75000 -73.3 42.0
10 2017 2018-07-24 Salisbu… 547300 920000 -73.4 42.0
# … with 321,418 more rows
## ggplot() +
## geom_polygon(data = con3,
## aes(x = long,
## y = lat,
## group = town),
## color = "black", fill = "slateblue1")
sale price
for each town in 2008
sale price
for each town in 2018
mean_2008
and mean_2018
mean_2008
# A tibble: 120 × 3
# Groups: town [120]
town year mean_3
<chr> <date> <dbl>
1 Ansonia 2008-01-01 221402.
2 Avon 2008-01-01 485537.
3 Barkhamsted 2008-01-01 274033.
4 Beacon Falls 2008-01-01 312228.
5 Berlin 2008-01-01 283870.
6 Bethany 2008-01-01 436891.
7 Bethel 2008-01-01 296576.
8 Bethlehem 2008-01-01 288259.
9 Bloomfield 2008-01-01 271202.
10 Branford 2008-01-01 315394.
# … with 110 more rows
mean_2018
# A tibble: 122 × 3
# Groups: town [122]
town year mean_3
<chr> <date> <dbl>
1 Ansonia 2018-01-01 193599.
2 Avon 2018-01-01 389546.
3 Barkhamsted 2018-01-01 301730.
4 Beacon Falls 2018-01-01 214421.
5 Berlin 2018-01-01 183557
6 Bethany 2018-01-01 358050
7 Bethel 2018-01-01 325866.
8 Bethlehem 2018-01-01 302177.
9 Bloomfield 2018-01-01 210893.
10 Branford 2018-01-01 292809.
# … with 112 more rows
mean_2008
and mean_2018
and create new column Delta_mean
combined <- mean_2008 %>%
left_join(mean_2018, by = "town") %>%
mutate(delta_mean = mean_3.y - mean_3.x)
head(combined, n = 25)
# A tibble: 25 × 6
# Groups: town [25]
town year.x mean_3.x year.y mean_3.y delta_mean
<chr> <date> <dbl> <date> <dbl> <dbl>
1 Ansonia 2008-01-01 221402. 2018-01-01 193599. -27803.
2 Avon 2008-01-01 485537. 2018-01-01 389546. -95991.
3 Barkhamsted 2008-01-01 274033. 2018-01-01 301730. 27696.
4 Beacon Falls 2008-01-01 312228. 2018-01-01 214421. -97807.
5 Berlin 2008-01-01 283870. 2018-01-01 183557 -100313.
6 Bethany 2008-01-01 436891. 2018-01-01 358050 -78841.
7 Bethel 2008-01-01 296576. 2018-01-01 325866. 29290.
8 Bethlehem 2008-01-01 288259. 2018-01-01 302177. 13918.
9 Bloomfield 2008-01-01 271202. 2018-01-01 210893. -60309.
10 Branford 2008-01-01 315394. 2018-01-01 292809. -22585.
# … with 15 more rows
# A tibble: 120 × 6
# Groups: town [120]
town year.x mean_3.x year.y mean_3.y delta_mean
<chr> <date> <dbl> <date> <dbl> <dbl>
1 Weston 2008-01-01 1039698. 2018-01-01 766908. -272790.
2 Washington 2008-01-01 707150 2018-01-01 450570. -256580.
3 Bridgewater 2008-01-01 662900 2018-01-01 461750 -201150
4 Easton 2008-01-01 741237. 2018-01-01 566425. -174813.
5 Canaan 2008-01-01 360000 2018-01-01 186514. -173486.
6 New Canaan 2008-01-01 1311972. 2018-01-01 1184363. -127610.
7 Westport 2008-01-01 1211626. 2018-01-01 1101054. -110572.
8 Oxford 2008-01-01 413123. 2018-01-01 304140. -108983.
9 Berlin 2008-01-01 283870. 2018-01-01 183557 -100313.
10 Haddam 2008-01-01 354452. 2018-01-01 256267. -98184.
# … with 110 more rows
Text and figures are licensed under Creative Commons Attribution CC BY-NC 4.0. The figures that have been reused from other sources don't fall under this license and can be recognized by a note in their caption: "Figure from ...".
For attribution, please cite this work as
Farrell (2022, March 6). Data Analytics and Computational Social Science: Homework 5: Incomplete. Retrieved from https://github.com/DACSS/dacss_course_website/posts/httpsrpubscomjnfarrell211873035/
BibTeX citation
@misc{farrell2022homework, author = {Farrell, Joseph}, title = {Data Analytics and Computational Social Science: Homework 5: Incomplete}, url = {https://github.com/DACSS/dacss_course_website/posts/httpsrpubscomjnfarrell211873035/}, year = {2022} }