Connecticut Estate Sales Data
connecticut <- read_csv("/Users/nelsonfarrell/Downloads/Real_Estate_Sales_2001-2019_GL.csv")
connecticut
str(connecticut)
spec_tbl_df [930,621 × 14] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
$ Serial Number : num [1:930621] 141466 140604 14340 140455 141195 ...
$ List Year : num [1:930621] 2014 2014 2014 2014 2014 ...
$ Date Recorded : chr [1:930621] "08/06/2015" "06/29/2015" "07/01/2015" "04/30/2015" ...
$ Town : chr [1:930621] "Stamford" "New Haven" "Ridgefield" "New Britain" ...
$ Address : chr [1:930621] "83 OVERBROOK DRIVE" "56 HIGHVIEW LANE" "32 OVERLOOK DR" "171 BRADFORD WALK" ...
$ Assessed Value : num [1:930621] 503270 86030 351880 204680 229330 ...
$ Sale Amount : num [1:930621] 850000 149900 570000 261000 250000 ...
$ Sales Ratio : num [1:930621] 0.592 0.574 0.617 0.784 0.917 ...
$ Property Type : chr [1:930621] "Residential" "Residential" "Residential" "Residential" ...
$ Residential Type: chr [1:930621] "Single Family" "Single Family" "Single Family" "Condo" ...
$ Non Use Code : chr [1:930621] NA NA NA NA ...
$ Assessor Remarks: chr [1:930621] NA NA NA NA ...
$ OPM remarks : chr [1:930621] NA NA NA NA ...
$ Location : chr [1:930621] "POINT (-73.525969 41.081897)" "POINT (-72.878115 41.30285)" "POINT (-73.508273 41.286223)" "POINT (-72.775592 41.713335)" ...
- attr(*, "spec")=
.. cols(
.. `Serial Number` = col_double(),
.. `List Year` = col_double(),
.. `Date Recorded` = col_character(),
.. Town = col_character(),
.. Address = col_character(),
.. `Assessed Value` = col_double(),
.. `Sale Amount` = col_double(),
.. `Sales Ratio` = col_double(),
.. `Property Type` = col_character(),
.. `Residential Type` = col_character(),
.. `Non Use Code` = col_character(),
.. `Assessor Remarks` = col_character(),
.. `OPM remarks` = col_character(),
.. Location = col_character()
.. )
- attr(*, "problems")=<externalptr>
Serial number
, Non Use Code
, Assessor Remarks
, Sales Ratio
, and OPM Remarks
connecticut <- select(connecticut,
"List Year",
"Date Recorded",
"Town",
"Assessed Value",
"Sale Amount",
"Property Type",
"Residential Type",
"Location")
head(connecticut) #view first 6 rows of `connecticut`
# A tibble: 6 × 8
`List Year` `Date Recorded` Town `Assessed Value` `Sale Amount`
<dbl> <chr> <chr> <dbl> <dbl>
1 2014 08/06/2015 Stamford 503270 850000
2 2014 06/29/2015 New Haven 86030 149900
3 2014 07/01/2015 Ridgefie… 351880 570000
4 2014 04/30/2015 New Brit… 204680 261000
5 2014 06/26/2015 Stamford 229330 250000
6 2014 06/29/2015 West Har… 356300 600000
# … with 3 more variables: `Property Type` <chr>,
# `Residential Type` <chr>, Location <chr>
connecticut <- rename(connecticut,
"list_year" = "List Year",
"sale_date" = "Date Recorded",
"assessed_price" = "Assessed Value",
"sale_price" = "Sale Amount",
"property_type" = "Property Type",
"residential_type" = "Residential Type",
"location" = "Location",
"town" = "Town")
List and description of remaining variables
list_year
: numeric, year the property was listedsale_date
: Date, date the property was soldassessed_price
: numeric, price used in tax evaluationsale_price
: numeric, price the property sold forproperty_type
: character, nominal, type of property (i.e., residential, commercial, etc.)residential_type
: character, nominal, if residential, type of residential (i.e., single family, two family, etc.)location
: character, nominal, location of property in lat./long. (maybe useful to create a map later)town
: character, nominal, town location propertyPotential Research Questions
head(connecticut)
# A tibble: 6 × 8
list_year sale_date town assessed_price sale_price property_type
<dbl> <chr> <chr> <dbl> <dbl> <chr>
1 2014 08/06/2015 Stamfo… 503270 850000 Residential
2 2014 06/29/2015 New Ha… 86030 149900 Residential
3 2014 07/01/2015 Ridgef… 351880 570000 Residential
4 2014 04/30/2015 New Br… 204680 261000 Residential
5 2014 06/26/2015 Stamfo… 229330 250000 Residential
6 2014 06/29/2015 West H… 356300 600000 Residential
# … with 2 more variables: residential_type <chr>, location <chr>
tail(connecticut)
# A tibble: 6 × 8
list_year sale_date town assessed_price sale_price property_type
<dbl> <chr> <chr> <dbl> <dbl> <chr>
1 2014 02/19/2015 Hamden 220080 402500 Residential
2 2014 12/18/2014 Danbury 248500 428000 Residential
3 2014 11/20/2014 Naugat… 175910 252500 Residential
4 2014 01/21/2015 Bridge… 723600 466667 Commercial
5 2014 06/18/2015 Madison 409200 580000 Residential
6 2014 04/22/2015 Cheshi… 379100 520000 Residential
# … with 2 more variables: residential_type <chr>, location <chr>
sale_date
from character to date
tibble [930,621 × 8] (S3: tbl_df/tbl/data.frame)
$ list_year : num [1:930621] 2014 2014 2014 2014 2014 ...
$ sale_date : Date[1:930621], format: "2015-08-06" ...
$ town : chr [1:930621] "Stamford" "New Haven" "Ridgefield" "New Britain" ...
$ assessed_price : num [1:930621] 503270 86030 351880 204680 229330 ...
$ sale_price : num [1:930621] 850000 149900 570000 261000 250000 ...
$ property_type : chr [1:930621] "Residential" "Residential" "Residential" "Residential" ...
$ residential_type: chr [1:930621] "Single Family" "Single Family" "Single Family" "Condo" ...
$ location : chr [1:930621] "POINT (-73.525969 41.081897)" "POINT (-72.878115 41.30285)" "POINT (-73.508273 41.286223)" "POINT (-72.775592 41.713335)" ...
property_type
variable
tally(~property_type, data = connecticut)
property_type
A Apartments C C/I/U
2210 2922 3 11913
Commercial Condo Four Family Industrial
19404 9932 237 2551
Public Utility R Residential Single Family
66 290287 494147 40518
Three Family Two Family UA V
1228 2411 11 24479
Vacant Land <NA>
23414 4888
property_type
for Residential
, Single Family
, Two Family
, Three Family
, Condo
connecticut
head(connecticut, n = 75)
# A tibble: 75 × 8
list_year sale_date town assessed_price sale_price property_type
<dbl> <date> <chr> <dbl> <dbl> <chr>
1 2014 2015-08-06 Stamf… 503270 850000 Residential
2 2014 2015-06-29 New H… 86030 149900 Residential
3 2014 2015-07-01 Ridge… 351880 570000 Residential
4 2014 2015-04-30 New B… 204680 261000 Residential
5 2014 2015-06-26 Stamf… 229330 250000 Residential
6 2014 2015-06-29 West … 356300 600000 Residential
7 2014 2014-10-17 Willi… 236060 332900 Residential
8 2014 2015-08-05 Trumb… 183900 240000 Residential
9 2014 2015-09-01 Water… 89610 75500 Residential
10 2014 2014-12-02 Water… 75840 91000 Residential
# … with 65 more rows, and 2 more variables: residential_type <chr>,
# location <chr>
tail(connecticut, n = 75)
# A tibble: 75 × 8
list_year sale_date town assessed_price sale_price property_type
<dbl> <date> <chr> <dbl> <dbl> <chr>
1 2014 2014-10-09 Groton 155610 83000 Residential
2 2014 2015-04-01 Manch… 112300 164800 Residential
3 2014 2015-07-30 Farmi… 449170 710000 Residential
4 2014 2015-04-28 Harwi… 146430 185000 Residential
5 2014 2015-05-19 Chesh… 266540 342000 Residential
6 2014 2015-09-21 Darien 549990 3395000 Residential
7 2014 2015-06-26 Hamden 148470 175000 Residential
8 2014 2015-07-21 Killi… 73850 124900 Residential
9 2014 2014-12-11 Brook… 240180 395000 Residential
10 2014 2015-07-16 Merid… 114170 108500 Residential
# … with 65 more rows, and 2 more variables: residential_type <chr>,
# location <chr>
assessed_price
and sale_price
summarize(connecticut,
mean_assessed_price = mean(assessed_price),
mean_sale_price = mean(sale_price),
median_assessed_price = median(assessed_price),
median_sale_price = median(sale_price),
sd_assessed_price =sd(assessed_price),
sd_sale_price = sd(sale_price))
# A tibble: 1 × 6
mean_assessed_pri… mean_sale_price median_assessed… median_sale_pri…
<dbl> <dbl> <dbl> <dbl>
1 247960. 349358. 159755 230000
# … with 2 more variables: sd_assessed_price <dbl>,
# sd_sale_price <dbl>
assessed_price
favstats(~assessed_price, data = connecticut)
min Q1 median Q3 max mean sd n missing
0 111000 159755 247270 881510000 247959.7 1292161 548236 0
sale_price
min Q1 median Q3 max mean sd n missing
0 147500 230000 360000 120000000 349357.8 636067.5 548236 0
property_type
Condo Residential Single Family Three Family Two Family
9932 494147 40518 1228 2411
# A tibble: 5 × 2
property_type n
<chr> <int>
1 Condo 9932
2 Residential 494147
3 Single Family 40518
4 Three Family 1228
5 Two Family 2411
property type
prop.table(table(select(connecticut, property_type)))
Condo Residential Single Family Three Family Two Family
0.018116286 0.901339934 0.073906128 0.002239911 0.004397741
town
variable
# A tibble: 170 × 2
town n
<chr> <int>
1 Bridgeport 18239
2 Waterbury 15533
3 Stamford 14678
4 Norwalk 12698
5 Fairfield 11340
6 West Hartford 10750
7 Danbury 10588
8 Greenwich 10559
9 Hamden 10318
10 New Haven 10133
# … with 160 more rows
town
variable
options(scipen = 999) #remove scientific notation
prop.table(table(select(connecticut, town)))
***Unknown*** Andover Ansonia Ashford
0.000001824032 0.000835406650 0.004607504797 0.001194740951
Avon Barkhamsted Beacon Falls Berlin
0.007234110857 0.000906543897 0.001935297937 0.005915335731
Bethany Bethel Bethlehem Bloomfield
0.001324247222 0.005667267381 0.000793453914 0.005625314646
Bolton Bozrah Branford Bridgeport
0.001406328661 0.000512552988 0.009603528407 0.033268519397
Bridgewater Bristol Brookfield Brooklyn
0.000541737500 0.016241180805 0.005570593686 0.002657614604
Burlington Canaan Canterbury Canton
0.003239480808 0.000383046717 0.001364375926 0.003553214309
Chaplin Cheshire Chester Clinton
0.000516201052 0.008085933795 0.001008689688 0.005059864730
Colchester Colebrook Columbia Cornwall
0.004158792929 0.000443239773 0.001455577525 0.000465128156
Coventry Cromwell Danbury Darien
0.003947205218 0.004755251388 0.019312850670 0.007179389898
Deep River Derby Durham East Granby
0.001459225589 0.003368987079 0.002115877104 0.002083044528
East Haddam East Hampton East Hartford East Haven
0.003026069065 0.004981431354 0.014707169905 0.008452564224
East Lyme East Windsor Eastford Easton
0.006807287373 0.003575102693 0.000390342845 0.002571885101
Ellington Enfield Essex Fairfield
0.004799028156 0.011383783626 0.002588301388 0.020684522724
Farmington Franklin Glastonbury Goshen
0.008974237372 0.000397638973 0.012649661824 0.001293238678
Granby Greenwich Griswold Groton
0.003463836742 0.019259953743 0.003319738215 0.009545159384
Guilford Haddam Hamden Hampton
0.007166621674 0.002570061069 0.018820362034 0.000454183965
Hartford Hartland Harwinton Hebron
0.013371978491 0.000490664604 0.001384440278 0.002467915277
Kent Killingly Killingworth Lebanon
0.001050642424 0.006103211026 0.001920705681 0.002103108880
Ledyard Lisbon Litchfield Lyme
0.005395486615 0.001067058712 0.002497099789 0.000802574074
Madison Manchester Mansfield Marlborough
0.006544626767 0.014645152817 0.004071239393 0.001906113426
Meriden Middlebury Middlefield Middletown
0.017324655805 0.002394953998 0.001081650968 0.012003954501
Milford Monroe Montville Morris
0.016611459299 0.006571987246 0.005152890361 0.000735084890
Naugatuck New Britain New Canaan New Fairfield
0.008742585310 0.016055129543 0.007237758921 0.003974565698
New Hartford New Haven New London New Milford
0.002104932912 0.018482916116 0.006210828913 0.009238722010
Newington Newtown Norfolk North Branford
0.010900415150 0.007660934342 0.000567273948 0.003533149957
North Canaan North Haven North Stonington Norwalk
0.000777037626 0.007130141034 0.001482938005 0.023161558161
Norwich Old Lyme Old Saybrook Orange
0.011091938508 0.003206648232 0.004394093055 0.003241304840
Oxford Plainfield Plainville Plymouth
0.003976389730 0.004122312289 0.005346237752 0.003565982533
Pomfret Portland Preston Prospect
0.001094419192 0.003296025799 0.001313303030 0.003166519528
Putnam Redding Ridgefield Rocky Hill
0.002965876010 0.002455147053 0.008383251009 0.005731108501
Roxbury Salem Salisbury Scotland
0.000811694234 0.001302358838 0.001568667508 0.000404935101
Seymour Sharon Shelton Sherman
0.005255036152 0.001140019991 0.012691614560 0.001265878198
Simsbury Somers South Windsor Southbury
0.007774024325 0.002783472811 0.008558358079 0.008458036320
Southington Sprague Stafford Stamford
0.014865860688 0.000735084890 0.003462012710 0.026773141494
Sterling Stonington Stratford Suffield
0.001130899831 0.006117803282 0.016883240064 0.004490766750
Thomaston Thompson Tolland Torrington
0.002145061616 0.002754288299 0.003834115235 0.013375626555
Trumbull Union Vernon Voluntown
0.010712539855 0.000184227231 0.008335826177 0.000795277946
Wallingford Warren Washington Waterbury
0.015137641454 0.000474248316 0.001530362836 0.028332688842
Waterford Watertown West Hartford West Haven
0.006809111405 0.005497632406 0.019608343852 0.013371978491
Westbrook Weston Westport Wethersfield
0.002624782028 0.003347098695 0.010915007406 0.008211792002
Willington Wilton Winchester Windham
0.001130899831 0.005380894359 0.003514909637 0.005324349368
Windsor Windsor Locks Wolcott Woodbridge
0.008587542591 0.004317483711 0.004704178492 0.002683151052
Woodbury Woodstock
0.003044309385 0.002850961994
property_types
connecticut %>%
group_by(property_type) %>%
summarize(mean_sale_price_type = mean(sale_price),
median_sale_price_type = median(sale_price),
sd_sale_price_type = sd(sale_price))
# A tibble: 5 × 4
property_type mean_sale_price_type median_sale_pri… sd_sale_price_t…
<chr> <dbl> <dbl> <dbl>
1 Condo 238111. 177900 236357.
2 Residential 346711. 226000 645755.
3 Single Family 418460. 280000 602036.
4 Three Family 240900. 225000 157237.
5 Two Family 243985. 205000 219448.
sale price
, check for outliers
connecticut %>%
group_by(property_type) %>%
ggplot(aes(x = property_type, y = sale_price)) +
geom_violin(notch = TRUE) +
labs(title = "Distribution of Sale Price for Property Types",
x = "Property Type",
y = "Sale Price") +
theme_light()
sale price
for under $2,500,000
connecticut %>%
filter(`sale_price` < 2500000) %>%
group_by(property_type) %>%
ggplot(aes(x = property_type, y = sale_price)) +
geom_violin(notch = TRUE) +
labs(title = "Distribution of Sale Price Under $2,500,000",
x = "Property Type",
y = "Sale Price") +
theme_bw()
sale price
of each town in Connecticut
connecticut %>%
group_by(town) %>%
summarize(median_sale_price = median(sale_price)) %>%
ggplot(aes(x = town, y = median_sale_price)) +
geom_bar(stat = "identity") +
labs(title = "Median Sale Price of Towns in Connecticut",
y = "Price",
x = "Town") +
theme_classic()
town
and median_sale_price
.connecticut %>%
group_by(year = lubridate::floor_date(sale_date, "year")) %>%
summarize(median_yearly_sale_price = median(sale_price)) %>%
arrange()
# A tibble: 20 × 2
year median_yearly_sale_price
<date> <dbl>
1 1999-01-01 95000
2 2001-01-01 425856.
3 2003-01-01 158900
4 2004-01-01 286900
5 2005-01-01 150000
6 2006-01-01 260000
7 2007-01-01 270000
8 2008-01-01 240000
9 2009-01-01 215000
10 2010-01-01 222100
11 2011-01-01 210000
12 2012-01-01 220000
13 2013-01-01 220500
14 2014-01-01 210000
15 2015-01-01 214000
16 2016-01-01 215000
17 2017-01-01 227000
18 2018-01-01 225000
19 2019-01-01 230000
20 2020-01-01 265000
year
connecticut %>%
group_by(year = lubridate::floor_date(sale_date, "year")) %>%
arrange(year)
# A tibble: 548,236 × 9
# Groups: year [20]
list_year sale_date town assessed_price sale_price property_type
<dbl> <date> <chr> <dbl> <dbl> <chr>
1 2017 1999-04-05 New L… 46690 95000 Residential
2 2017 2001-08-23 Shelt… 427910 558000 Residential
3 2017 2001-08-01 Shelt… 390810 635000 Residential
4 2006 2001-09-04 Newin… 64750 88000 Residential
5 2006 2001-10-27 Green… 573930 660000 Residential
6 2006 2001-09-25 Nauga… 183850 293711 Residential
7 2006 2001-09-05 Newin… 166110 256000 Residential
8 2006 2003-07-17 Nauga… 84200 158900 Residential
9 2006 2004-07-31 Newin… 246120 420000 Residential
10 2006 2004-06-05 Brist… 147320 329900 Residential
# … with 548,226 more rows, and 3 more variables:
# residential_type <chr>, location <chr>, year <date>
median_sale_price_bottom_10
# A tibble: 10 × 2
town median_sale_price
<chr> <dbl>
1 Waterbury 94300
2 Norwich 126140
3 Torrington 128500
4 Hartford 130000
5 Windham 130000
6 New Britain 134000
7 East Hartford 140000
8 Winchester 141000
9 New London 145000
10 Bridgeport 146000
median_sale_price_top_10
# A tibble: 10 × 2
town median_sale_price
<chr> <dbl>
1 Greenwich 1335000
2 Darien 1300000
3 New Canaan 1242591
4 Westport 1100000
5 Weston 799000
6 Wilton 741000
7 Ridgefield 614625
8 Roxbury 600000
9 Easton 577517
10 Redding 541975
top_10 <- median_sale_price_top_10$town
connecticut %>%
group_by(town, year = lubridate::floor_date(sale_date, "year")) %>%
summarize(median_yearly_sale_price = median(sale_price)) %>%
filter(town %in% top_10) %>%
ggplot(aes(x = year, y = median_yearly_sale_price)) +
geom_line(aes(color = town)) +
ggtitle("Towns in Connecticut with the Highest Median Sale Price: Top 10") +
ylab("Sale Price") +
xlab("Year") +
theme_classic()
median_yearly sale price
, year
, and town
.yearly_median_sale_price
going up the fastest.yearly_median_sale_price
is an aggregate of all sales in that year.property_type
first and look at each property type separately.connecticut %>%
group_by(town, year = lubridate::floor_date(sale_date, "year")) %>%
summarize(median_yearly_sale_price = median(sale_price)) %>%
filter(town %in% top_10) %>%
ggplot(aes(x = year, y = median_yearly_sale_price)) +
geom_line(aes(color = town)) +
ggtitle("Towns With Highest Median Sale Price: Top 10") +
ylab("Sale Price") +
xlab("Year") +
theme_classic() +
facet_wrap(vars(town))
Text and figures are licensed under Creative Commons Attribution CC BY-NC 4.0. The figures that have been reused from other sources don't fall under this license and can be recognized by a note in their caption: "Figure from ...".
For attribution, please cite this work as
Farrell (2022, Feb. 27). Data Analytics and Computational Social Science: Homework 4. Retrieved from https://github.com/DACSS/dacss_course_website/posts/httpsrpubscomjnfarrell211870094/
BibTeX citation
@misc{farrell2022homework, author = {Farrell, Joseph}, title = {Data Analytics and Computational Social Science: Homework 4}, url = {https://github.com/DACSS/dacss_course_website/posts/httpsrpubscomjnfarrell211870094/}, year = {2022} }