Homework3

NYC Real Estate Sales Data

Joseph Farrell
2022-02-16
Load Libraries

Reading in, exploring, and cleaning dataset

Read in NYC Real Estate Data
nyc_rolling_sales <- read_csv("/Users/nelsonfarrell/Downloads/nyc-rolling-sales.csv")
Get Column Names
colnames(nyc_rolling_sales)
 [1] "...1"                           "BOROUGH"                       
 [3] "NEIGHBORHOOD"                   "BUILDING CLASS CATEGORY"       
 [5] "TAX CLASS AT PRESENT"           "BLOCK"                         
 [7] "LOT"                            "EASE-MENT"                     
 [9] "BUILDING CLASS AT PRESENT"      "ADDRESS"                       
[11] "APARTMENT NUMBER"               "ZIP CODE"                      
[13] "RESIDENTIAL UNITS"              "COMMERCIAL UNITS"              
[15] "TOTAL UNITS"                    "LAND SQUARE FEET"              
[17] "GROSS SQUARE FEET"              "YEAR BUILT"                    
[19] "TAX CLASS AT TIME OF SALE"      "BUILDING CLASS AT TIME OF SALE"
[21] "SALE PRICE"                     "SALE DATE"                     
View first 6 rows of the dataset
head(nyc_rolling_sales)
# A tibble: 6 × 22
   ...1 BOROUGH NEIGHBORHOOD  `BUILDING CLASS…` `TAX CLASS AT …` BLOCK
  <dbl>   <dbl> <chr>         <chr>             <chr>            <dbl>
1     4       1 ALPHABET CITY 07 RENTALS - WAL… 2A                 392
2     5       1 ALPHABET CITY 07 RENTALS - WAL… 2                  399
3     6       1 ALPHABET CITY 07 RENTALS - WAL… 2                  399
4     7       1 ALPHABET CITY 07 RENTALS - WAL… 2B                 402
5     8       1 ALPHABET CITY 07 RENTALS - WAL… 2A                 404
6     9       1 ALPHABET CITY 07 RENTALS - WAL… 2                  405
# … with 16 more variables: LOT <dbl>, `EASE-MENT` <lgl>,
#   `BUILDING CLASS AT PRESENT` <chr>, ADDRESS <chr>,
#   `APARTMENT NUMBER` <chr>, `ZIP CODE` <dbl>,
#   `RESIDENTIAL UNITS` <dbl>, `COMMERCIAL UNITS` <dbl>,
#   `TOTAL UNITS` <dbl>, `LAND SQUARE FEET` <chr>,
#   `GROSS SQUARE FEET` <chr>, `YEAR BUILT` <dbl>,
#   `TAX CLASS AT TIME OF SALE` <dbl>, …
Check the variables in the dataset
str(nyc_rolling_sales)
spec_tbl_df [84,548 × 22] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
 $ ...1                          : num [1:84548] 4 5 6 7 8 9 10 11 12 13 ...
 $ BOROUGH                       : num [1:84548] 1 1 1 1 1 1 1 1 1 1 ...
 $ NEIGHBORHOOD                  : chr [1:84548] "ALPHABET CITY" "ALPHABET CITY" "ALPHABET CITY" "ALPHABET CITY" ...
 $ BUILDING CLASS CATEGORY       : chr [1:84548] "07 RENTALS - WALKUP APARTMENTS" "07 RENTALS - WALKUP APARTMENTS" "07 RENTALS - WALKUP APARTMENTS" "07 RENTALS - WALKUP APARTMENTS" ...
 $ TAX CLASS AT PRESENT          : chr [1:84548] "2A" "2" "2" "2B" ...
 $ BLOCK                         : num [1:84548] 392 399 399 402 404 405 406 407 379 387 ...
 $ LOT                           : num [1:84548] 6 26 39 21 55 16 32 18 34 153 ...
 $ EASE-MENT                     : logi [1:84548] NA NA NA NA NA NA ...
 $ BUILDING CLASS AT PRESENT     : chr [1:84548] "C2" "C7" "C7" "C4" ...
 $ ADDRESS                       : chr [1:84548] "153 AVENUE B" "234 EAST 4TH   STREET" "197 EAST 3RD   STREET" "154 EAST 7TH STREET" ...
 $ APARTMENT NUMBER              : chr [1:84548] NA NA NA NA ...
 $ ZIP CODE                      : num [1:84548] 10009 10009 10009 10009 10009 ...
 $ RESIDENTIAL UNITS             : num [1:84548] 5 28 16 10 6 20 8 44 15 24 ...
 $ COMMERCIAL UNITS              : num [1:84548] 0 3 1 0 0 0 0 2 0 0 ...
 $ TOTAL UNITS                   : num [1:84548] 5 31 17 10 6 20 8 46 15 24 ...
 $ LAND SQUARE FEET              : chr [1:84548] "1633" "4616" "2212" "2272" ...
 $ GROSS SQUARE FEET             : chr [1:84548] "6440" "18690" "7803" "6794" ...
 $ YEAR BUILT                    : num [1:84548] 1900 1900 1900 1913 1900 ...
 $ TAX CLASS AT TIME OF SALE     : num [1:84548] 2 2 2 2 2 2 2 2 2 2 ...
 $ BUILDING CLASS AT TIME OF SALE: chr [1:84548] "C2" "C7" "C7" "C4" ...
 $ SALE PRICE                    : chr [1:84548] "6625000" "-" "-" "3936272" ...
 $ SALE DATE                     : POSIXct[1:84548], format: "2017-07-19" ...
 - attr(*, "spec")=
  .. cols(
  ..   ...1 = col_double(),
  ..   BOROUGH = col_double(),
  ..   NEIGHBORHOOD = col_character(),
  ..   `BUILDING CLASS CATEGORY` = col_character(),
  ..   `TAX CLASS AT PRESENT` = col_character(),
  ..   BLOCK = col_double(),
  ..   LOT = col_double(),
  ..   `EASE-MENT` = col_logical(),
  ..   `BUILDING CLASS AT PRESENT` = col_character(),
  ..   ADDRESS = col_character(),
  ..   `APARTMENT NUMBER` = col_character(),
  ..   `ZIP CODE` = col_double(),
  ..   `RESIDENTIAL UNITS` = col_double(),
  ..   `COMMERCIAL UNITS` = col_double(),
  ..   `TOTAL UNITS` = col_double(),
  ..   `LAND SQUARE FEET` = col_character(),
  ..   `GROSS SQUARE FEET` = col_character(),
  ..   `YEAR BUILT` = col_double(),
  ..   `TAX CLASS AT TIME OF SALE` = col_double(),
  ..   `BUILDING CLASS AT TIME OF SALE` = col_character(),
  ..   `SALE PRICE` = col_character(),
  ..   `SALE DATE` = col_datetime(format = "")
  .. )
 - attr(*, "problems")=<externalptr> 
Change “SALE PRICE” to Numeric Variable
nyc_rolling_sales$`SALE PRICE` <- as.numeric(nyc_rolling_sales$`SALE PRICE`)
Change borough code numbers into borough names “Manhatten”, “Queens”, etc. In the dataset the different boroughs of New York City were represented with numbers 1-5, 1 Manhatten, 2 the Bronx, etc…
nyc_rolling_sales <- mutate(nyc_rolling_sales, BOROUGH = recode(BOROUGH, `1` = "Manhatten",
    `2` = "Bronx", `3` = "Brooklyn", `4` = "Queens", `5` = "Staten Island"))
Recode “BUILDING CLASS” to be more descriptive The building classes in the data were coded. Here I will recode R1-R4, essentially all the single unit residential properties in the dataset
nyc_rolling_sales <- mutate(nyc_rolling_sales, `BUILDING CLASS AT PRESENT` = recode(`BUILDING CLASS AT PRESENT`, 
    `R4` = "Condo Elevator Building", `R3` = "Condo 1 to 3 Stories ", 
    `R2` = "Condo Walkup", `R1` = "Condo 2 to 10 Unit Building"))
Select columns “BOROUGH”, “NEIGHBORHOOD”, “BUILDING CLASS AT PRESENT”, “RESIDENTIAL UNITS”, “GROSS SQUARE FEET”, “SALE PRICE”, “SALE DATE” Not all of the variables will be useful for my planned analysis, this removes those that I do not plan use
nyc_rolling_sales <-select(nyc_rolling_sales, "BOROUGH", "NEIGHBORHOOD", "BUILDING CLASS AT PRESENT",
    "RESIDENTIAL UNITS", "GROSS SQUARE FEET", "SALE PRICE", "SALE DATE")
Rename columns “BUILDING CLASS AT PRESENT”, and “RESIDENTIAL UNITS” This changes the names of some of the columns for clarity, “Residential Units” will become “Units” because I will only be looking at Residential Units. “BUILDING CLASS AT PRESENT” will become simply "BUILDING CLASS
nyc_rolling_sales <- rename(nyc_rolling_sales, "BUILDING CLASS" = "BUILDING CLASS AT PRESENT", 
    "SQ FOOTAGE" = "GROSS SQUARE FEET", "UNITS" = "RESIDENTIAL UNITS")
Remove all observations that do not contain a residential unit, and all sale prices under $100,000 and over $5,000,000 There were a lot very low prices that according to data description could indicate family member to family member transfer, i.e., inheritance. The data is also heavily skewed to the right, so I will look at only specific range of prices (i.e., $100,000-$5,000,000)
nyc_rolling_sales <- filter(nyc_rolling_sales, ! `UNITS` == "0" & ! `SALE PRICE` == "-" &
    `SALE PRICE` > 100000 & `SALE PRICE` < 5000000)
arrange(nyc_rolling_sales, `SALE PRICE`)
# A tibble: 37,897 × 7
   BOROUGH       NEIGHBORHOOD      `BUILDING CLASS` UNITS `SQ FOOTAGE`
   <chr>         <chr>             <chr>            <dbl> <chr>       
 1 Staten Island MIDLAND BEACH     "A6"                 1 588         
 2 Brooklyn      PARK SLOPE SOUTH  "C0"                 3 2781        
 3 Queens        FLUSHING-SOUTH    "Condo 1 to 3 S…     1 -           
 4 Queens        JAMAICA           "S2"                 2 3225        
 5 Bronx         MELROSE/CONCOURSE "C0"                 3 3972        
 6 Brooklyn      BOERUM HILL       "Condo Elevator…     1 0           
 7 Bronx         PARKCHESTER       "Condo Elevator…     1 0           
 8 Bronx         PARKCHESTER       "Condo Elevator…     1 0           
 9 Queens        REGO PARK         "B3"                 2 2128        
10 Bronx         PARKCHESTER       "Condo Elevator…     1 0           
# … with 37,887 more rows, and 2 more variables: `SALE PRICE` <dbl>,
#   `SALE DATE` <dttm>
head(nyc_rolling_sales) 
# A tibble: 6 × 7
  BOROUGH   NEIGHBORHOOD  `BUILDING CLASS`        UNITS `SQ FOOTAGE`
  <chr>     <chr>         <chr>                   <dbl> <chr>       
1 Manhatten ALPHABET CITY C4                         10 6794        
2 Manhatten ALPHABET CITY C4                          8 4226        
3 Manhatten ALPHABET CITY Condo Walkup                1 -           
4 Manhatten ALPHABET CITY Condo Elevator Building     1 -           
5 Manhatten ALPHABET CITY Condo Elevator Building     1 -           
6 Manhatten ALPHABET CITY Condo Elevator Building     1 -           
# … with 2 more variables: `SALE PRICE` <dbl>, `SALE DATE` <dttm>
Identify and view the different types of building classes Each of these identifiers is defined in the description of the dataset, R4s (“Condo; Residential Unit in Elevator Building”) are the most numerous. It will be most revealing to compare like dwellings so I have renamed R1-R4, and I will filter out the rest
tally(~ `BUILDING CLASS`, data = nyc_rolling_sales)
BUILDING CLASS
                         A0                          A1 
                        307                        4666 
                         A2                          A3 
                       1895                         238 
                         A4                          A5 
                         91                        4023 
                         A6                          A7 
                         62                           2 
                         A9                          B1 
                        949                        2718 
                         B2                          B3 
                       3124                        2437 
                         B9                          C0 
                        918                        2245 
                         C1                          C2 
                        267                         369 
                         C3                          C4 
                        538                          46 
                         C5                          C6 
                         63                           7 
                         C7                          C9 
                         66                           5 
      Condo 1 to 3 Stories  Condo 2 to 10 Unit Building 
                       1076                         934 
    Condo Elevator Building                Condo Walkup 
                       8931                         646 
                         D1                          D3 
                         11                           3 
                         D4                          D6 
                         24                           6 
                         D7                          D9 
                          2                           2 
                         E9                          F9 
                          2                           2 
                         G0                          G4 
                          7                           1 
                         G9                          HR 
                          1                           2 
                         I7                          K1 
                          2                           2 
                         K2                          K4 
                          2                          99 
                         K9                          M1 
                          3                           1 
                         M9                          N2 
                          5                           1 
                         N9                          O2 
                          2                           1 
                         O5                          O7 
                          2                           1 
                         O8                          P5 
                         18                           2 
                         R6                          RR 
                        129                           9 
                         S0                          S1 
                          7                         202 
                         S2                          S3 
                        403                          79 
                         S4                          S5 
                         61                          50 
                         S9                          V0 
                         96                          28 
                         V1                          W2 
                          3                           1 
                         Z0                          Z9 
                          1                           1 
Filter “Building Class” for “Condo Elevator Building”, previously “R4” Here I have filtered for just “Condo Elevator Building”
nyc_rolling_sales<- filter(nyc_rolling_sales, `BUILDING CLASS` == "Condo Elevator Building")
head(nyc_rolling_sales)
# A tibble: 6 × 7
  BOROUGH   NEIGHBORHOOD  `BUILDING CLASS`        UNITS `SQ FOOTAGE`
  <chr>     <chr>         <chr>                   <dbl> <chr>       
1 Manhatten ALPHABET CITY Condo Elevator Building     1 -           
2 Manhatten ALPHABET CITY Condo Elevator Building     1 -           
3 Manhatten ALPHABET CITY Condo Elevator Building     1 -           
4 Manhatten ALPHABET CITY Condo Elevator Building     1 -           
5 Manhatten ALPHABET CITY Condo Elevator Building     1 -           
6 Manhatten ALPHABET CITY Condo Elevator Building     1 -           
# … with 2 more variables: `SALE PRICE` <dbl>, `SALE DATE` <dttm>

The Variables in the dataset after cleaning the data

Exploratory analyiss

Create Object “Brooklyn_1” for property sales of “CONDO ELEVATOR BUILDINGs” in Brooklyn
Brooklyn_1 <- filter(nyc_rolling_sales, `BOROUGH` == "Brooklyn")
Create Object “Manhatten_1” for property sales of “CONDO ELEVATOR BUILDINGs” in Manhatten
Manhatten_1 <- filter(nyc_rolling_sales, `BOROUGH` == "Manhatten")
Display Boxplot of “SALE PRICE” of “CONDO ELEVATOR BUILDINGS” in Brooklyn & Manhatten Here we can still clearly see that even after looking at only a range of the “SALE PRICE” the data is still heavily skewed to the right with numerous outliers. As a result, I will use median as a measure of center.
options(scipen = 999) ##Remove Scientific Notation
boxplot(Brooklyn_1$`SALE PRICE`, Manhatten_1$`SALE PRICE`, horizontal = TRUE, names=c("Brooklyn", "Manhatten"), 
    main = "Sales of Single Unit Homes in Brooklyn & Manhatten", ylab = "Boroughs", xlab = "Sale Price") 

Display summary statistics of “Brooklyn_1” and “SALE PRICE” and “Manhatten_1” and “SALE PRICE”
favstats(Brooklyn_1$`SALE PRICE`)
    min     Q1 median      Q3     max    mean       sd    n missing
 101825 682227 936270 1482500 4938512 1206647 806376.9 2159       0
favstats(Manhatten_1$`SALE PRICE`)
    min     Q1  median      Q3     max    mean      sd    n missing
 150000 900000 1396500 2310000 4996841 1734470 1074408 4783       0
Find the median “SALE PRICE” of “CONDO ELEVATOR BUILDINGs” in each of the five New York City “BOROUGHS”
nyc_rolling_sales %>%
  group_by(BOROUGH) %>%
  select("SALE PRICE") %>%
  summarize_all(median, na.rm = TRUE)
# A tibble: 5 × 2
  BOROUGH       `SALE PRICE`
  <chr>                <dbl>
1 Bronx               143500
2 Brooklyn            936270
3 Manhatten          1396500
4 Queens              643000
5 Staten Island       240318
Find the median “SALE PRICE” of “CONDO ELEVATOR BUILDINGs” in each of the different Manhatten “NEIGHBORHOODs”
Manhatten_1 %>%
  group_by(NEIGHBORHOOD) %>%
  select("SALE PRICE") %>%
  summarize_all(median, na.rm = TRUE)
# A tibble: 38 × 2
   NEIGHBORHOOD  `SALE PRICE`
   <chr>                <dbl>
 1 ALPHABET CITY     1310000 
 2 CHELSEA           1820000 
 3 CHINATOWN         1440000 
 4 CIVIC CENTER      3250000 
 5 CLINTON           1236250 
 6 EAST VILLAGE      1285000 
 7 FASHION           1735000 
 8 FINANCIAL         1450000 
 9 FLATIRON          2300000 
10 GRAMERCY          1834270.
# … with 28 more rows

Potential Research Questions

Reuse

Text and figures are licensed under Creative Commons Attribution CC BY-NC 4.0. The figures that have been reused from other sources don't fall under this license and can be recognized by a note in their caption: "Figure from ...".

Citation

For attribution, please cite this work as

Farrell (2022, Feb. 16). Data Analytics and Computational Social Science: Homework3. Retrieved from https://github.com/DACSS/dacss_course_website/posts/httpsrpubscomjnfarrell211866926/

BibTeX citation

@misc{farrell2022homework3,
  author = {Farrell, Joseph},
  title = {Data Analytics and Computational Social Science: Homework3},
  url = {https://github.com/DACSS/dacss_course_website/posts/httpsrpubscomjnfarrell211866926/},
  year = {2022}
}