Real Estate Data Analysis

Assignment 1
Lujia Li
NeighborhoodSales Summary
ggplot2
Author

Lujia Li

Published

May 3, 2023

Code
# read dataset
library(readxl)
library(dbplyr)
library(tidyverse)
── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
✔ ggplot2 3.4.1     ✔ purrr   1.0.1
✔ tibble  3.1.8     ✔ dplyr   1.1.0
✔ tidyr   1.3.0     ✔ stringr 1.5.0
✔ readr   2.1.4     ✔ forcats 1.0.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::ident()  masks dbplyr::ident()
✖ dplyr::lag()    masks stats::lag()
✖ dplyr::sql()    masks dbplyr::sql()
Code
brooklyn2019 <- read_excel('Assignment_1_LujiaLi_folder/2019_brooklyn.xlsx', skip = 6)
Error: `path` does not exist: 'Assignment_1_LujiaLi_folder/2019_brooklyn.xlsx'
Code
head(brooklyn2019)
Error in head(brooklyn2019): object 'brooklyn2019' not found
Code
# dimension of dataset
dim(brooklyn2019)
Error in eval(expr, envir, enclos): object 'brooklyn2019' not found
Code
# variables/column names
colnames(brooklyn2019)
Error in is.data.frame(x): object 'brooklyn2019' not found
Code
# remove the special characters in variables
colnames(brooklyn2019) <- colnames(brooklyn2019) %>% 
  gsub("\r", "", .) %>%    # delete the special character "\r" in variables
  gsub("\n", " ", .) %>%   # replace the special character "\n" with space in variables
  gsub(" $", "",. , perl = T) %>%  # delete the space at the end of variables
  gsub("  ", " ",. , perl = T)  # replace the double spaces with single space in variables
Error in is.data.frame(x): object 'brooklyn2019' not found
Code
colnames(brooklyn2019)
Error in is.data.frame(x): object 'brooklyn2019' not found
Code
# remove duplicated rows in dataset
brooklyn2019 <- distinct(brooklyn2019)
Error in distinct(brooklyn2019): object 'brooklyn2019' not found
Code
str(brooklyn2019)
Error in str(brooklyn2019): object 'brooklyn2019' not found
Code
# subset dataset
# c('BOROUGH','RESIDENTIAL UNITS','COMMERCIAL UNITS','TOTAL UNITS','LAND SQUARE FEET','GROSS SQUARE FEET',
#   'YEAR BUILT','BUILDING CLASS AT TIME OF SALE','SALE PRICE','SALE DATE') %>%
#    subset(brooklyn2019, select = .) -> selected_brooklyn2019

selected_brooklyn2019 <- subset(brooklyn2019, select = c('BOROUGH','RESIDENTIAL UNITS','COMMERCIAL UNITS','TOTAL UNITS','LAND SQUARE FEET','GROSS SQUARE FEET',
  'YEAR BUILT','BUILDING CLASS AT TIME OF SALE','SALE PRICE','SALE DATE'))
Error in subset(brooklyn2019, select = c("BOROUGH", "RESIDENTIAL UNITS", : object 'brooklyn2019' not found
Code
str(selected_brooklyn2019)
Error in str(selected_brooklyn2019): object 'selected_brooklyn2019' not found
Code
# count the number and calculate the percentage of NA values
NumNA_column <- function(x){sum(is.na(x))} # function to count the number of NA values
apply(selected_brooklyn2019, 2, NumNA_column)   # number of NA in each column
Error in apply(selected_brooklyn2019, 2, NumNA_column): object 'selected_brooklyn2019' not found
Code
PercentageNA_column <- function(x){sum(is.na(x))/length(x)*100}
apply(selected_brooklyn2019, 2, PercentageNA_column) # percentage of NA in each column
Error in apply(selected_brooklyn2019, 2, PercentageNA_column): object 'selected_brooklyn2019' not found
Code
# count the number of 0 sales price
# 0 sale price means that a transfer of ownership without a cash consideration
table(selected_brooklyn2019$`SALE PRICE` == 0)
Error in table(selected_brooklyn2019$`SALE PRICE` == 0): object 'selected_brooklyn2019' not found
Code
prop.table(table(selected_brooklyn2019$`SALE PRICE` == 0))
Error in table(selected_brooklyn2019$`SALE PRICE` == 0): object 'selected_brooklyn2019' not found
Code
# remove rows with NA values and 0 sale price
# 0 sale price means that a transfer of ownership without a cash consideration
new_brooklyn2019 <- na.omit(selected_brooklyn2019) %>% # delete rows with NA values
  filter(., 'SALE PRICE' != 0)     # select rows with non-zero sale price
Error in na.omit(selected_brooklyn2019): object 'selected_brooklyn2019' not found
Code
str(new_brooklyn2019)
Error in str(new_brooklyn2019): object 'new_brooklyn2019' not found
Code
# statics of dataset
summary(new_brooklyn2019)
Error in summary(new_brooklyn2019): object 'new_brooklyn2019' not found
Code
# merge all sub-datasets
final_dataframe <- data.frame()  # define a dataframe
filename <- list.files("Assignment_1_LujiaLi_folder")   # names of all .xlsx files
for (i in filename){
  dataframe <- read_excel(gsub(" ", "", paste("Assignment_1_LujiaLi_folder/", i)), skip = 6)
  colnames(dataframe) <- colnames(dataframe) %>% 
    gsub("\r", "", .) %>%    # delete the special character "\r" in column names
    gsub("\n", " ", .) %>%   # replace the special character "\n" with space in column names
    gsub(" $","",. ,perl = T) %>%  # delete the space at the tail of column names
    gsub("  "," ",. ,perl = T)  # replace the double spaces with single space in column names
  
  # remove duplicated rows in dataset
  dataframe <- distinct(dataframe)
  
  # subset dataset
  # c('BOROUGH','RESIDENTIAL UNITS','COMMERCIAL UNITS','TOTAL UNITS','LAND SQUARE FEET','GROSS SQUARE FEET',
  #   'YEAR BUILT','BUILDING CLASS AT TIME OF SALE','SALE PRICE','SALE DATE') %>%
  #   subset(dataframe, select = .) -> selected_dataframe
  
  selected_dataframe <- subset(dataframe, select = c('BOROUGH','RESIDENTIAL UNITS','COMMERCIAL UNITS','TOTAL UNITS','LAND SQUARE FEET','GROSS SQUARE FEET',
    'YEAR BUILT','BUILDING CLASS AT TIME OF SALE','SALE PRICE','SALE DATE'))
  
  # remove rows with NA values and 0 sale price
  # 0 sale price means that a transfer of ownership without a cash consideration
  new_dataframe <- na.omit(selected_dataframe) %>%
    filter(., `SALE PRICE` != 0)
  # merge dataframe
  final_dataframe <- rbind(final_dataframe, new_dataframe)   
}
str(final_dataframe)
'data.frame':   0 obs. of  0 variables