Code
# Attach the tidyverse packages used throughout the document.
library(tidyverse)
# Echo each chunk's source code in the rendered output.
knitr::opts_chunk$set(echo = TRUE)
Karla Barrett-Dexter
December 11, 2022
Error: '_data/od.csv' does not exist in current working directory ('C:/Users/srika/OneDrive/Desktop/601_Fall_2022/posts').
Error in eval(expr, envir, enclos): object 'MillenialMigration' not found
As a continuation from HW2, I renamed the columns again for ease of understanding and additionally, I separated the race/parental column and created two separate columns for each variable because I was interested in exploring these data points independently of each other.I found it was taking a very long time to run this code every time I returned to work on the assignment, so I decided to export the new CSV file and am writing in all the code used to get to this step as comments to show my work and continuing with the new file to avoid the time issues I was having running the code.
# Give the raw columns descriptive names. Each original column is renamed
# exactly once (the earlier version listed `Num_Migrators = n` twice).
MillenialMigration <- MillenialMigration %>%
  rename(
    Origin_Zone = o_cz,
    Origin_City = o_cz_name,
    Origin_State = o_state_name,
    Dest_Zone = d_cz,
    Dest_City = d_cz_name,
    Dest_State = d_state_name,
    Num_Migrators = n,
    N_from_Origin = n_tot_o,
    N_from_Dest = n_tot_d,
    Race_ParentalIncome = pool
  )
Error in rename(., Origin_Zone = o_cz, Origin_City = o_cz_name, Origin_State = o_state_name, : object 'MillenialMigration' not found
Error in eval(expr, envir, enclos): object 'MillenialMigration' not found
#I wanted to separate the Race and Parental Income data, in order to analyze the data separately in future iterations. I could not figure out a way to separate the two without putting a character in between. I used the following code to update the Race_ParentalIncome column to have an underscore in it. This was probably not the most efficient way to accomplish this outcome and it took me quite a while to get the code right.In addition, I found it was taking a very long time to run this code every time I returned to work on the assignment, so I decided to export the new CSV file and am writing in all the code used to get to this step as comments to show my work and continuing with the new file to avoid the time issues I was having running the code.
#MillenialMigration_ <- MillenialMigration %>%
# mutate(Race_ParentalIncome = stringr::str_replace(Race_ParentalIncome, "AsianQ1", "Asian_Q1"))%>%
# mutate(Race_ParentalIncome = stringr::str_replace(Race_ParentalIncome, "AsianQ2", "Asian_Q2"))%>%
# mutate(Race_ParentalIncome = stringr::str_replace(Race_ParentalIncome, "AsianQ3", "Asian_Q3"))%>%
# mutate(Race_ParentalIncome = stringr::str_replace(Race_ParentalIncome, "AsianQ4", "Asian_Q4"))%>%
# mutate(Race_ParentalIncome = stringr::str_replace(Race_ParentalIncome, "AsianQ5", "Asian_Q5"))%>%
# mutate(Race_ParentalIncome = stringr::str_replace(Race_ParentalIncome, "BlackQ1", "Black_Q1"))%>%
# mutate(Race_ParentalIncome = stringr::str_replace(Race_ParentalIncome, "BlackQ2", "Black_Q2"))%>%
# mutate(Race_ParentalIncome = stringr::str_replace(Race_ParentalIncome, "BlackQ3", "Black_Q3"))%>%
# mutate(Race_ParentalIncome = stringr::str_replace(Race_ParentalIncome, "BlackQ4", "Black_Q4"))%>%
# mutate(Race_ParentalIncome = stringr::str_replace(Race_ParentalIncome, "BlackQ5", "Black_Q5"))%>%
# mutate(Race_ParentalIncome = stringr::str_replace(Race_ParentalIncome, "HispanicQ1", "Hispanic_Q1"))%>%
# mutate(Race_ParentalIncome = stringr::str_replace(Race_ParentalIncome, "HispanicQ2", "Hispanic_Q2"))%>%
# mutate(Race_ParentalIncome = stringr::str_replace(Race_ParentalIncome, "HispanicQ3", "Hispanic_Q3"))%>%
# mutate(Race_ParentalIncome = stringr::str_replace(Race_ParentalIncome, "HispanicQ4", "Hispanic_Q4"))%>%
# mutate(Race_ParentalIncome = stringr::str_replace(Race_ParentalIncome, "HispanicQ5", "Hispanic_Q5"))%>%
# mutate(Race_ParentalIncome = stringr::str_replace(Race_ParentalIncome, "OtherQ1", "Other_Q1"))%>%
# mutate(Race_ParentalIncome = stringr::str_replace(Race_ParentalIncome, "OtherQ2", "Other_Q2"))%>%
# mutate(Race_ParentalIncome = stringr::str_replace(Race_ParentalIncome, "OtherQ3", "Other_Q3"))%>%
# mutate(Race_ParentalIncome = stringr::str_replace(Race_ParentalIncome, "OtherQ4", "Other_Q4"))%>%
# mutate(Race_ParentalIncome = stringr::str_replace(Race_ParentalIncome, "OtherQ5", "Other_Q5"))%>%
# mutate(Race_ParentalIncome = stringr::str_replace(Race_ParentalIncome, "WhiteQ1", "White_Q1"))%>%
# mutate(Race_ParentalIncome = stringr::str_replace(Race_ParentalIncome, "WhiteQ2", "White_Q2"))%>%
# mutate(Race_ParentalIncome = stringr::str_replace(Race_ParentalIncome, "WhiteQ3", "White_Q3"))%>%
# mutate(Race_ParentalIncome = stringr::str_replace(Race_ParentalIncome, "WhiteQ4", "White_Q4"))%>%
# mutate(Race_ParentalIncome = stringr::str_replace(Race_ParentalIncome, "WhiteQ5", "White_Q5"))
#I used the following code to check my work
#MigratorsByRace_Income <- MillenialMigration_ %>%
# group_by(Race_ParentalIncome) %>%
#summarise(Freq = sum(Num_Migrators))
#print(n=30, MigratorsByRace_Income)
#The following code was used to separate the column and create two new columns, one for Race and one for Parental Income.
#MillenialMigration_Sep <- separate(MillenialMigration_, Race_ParentalIncome, into = c("Race", "Parental_Income"), sep = "_")
#MillenialMigration_Sep
#The following code was used to create a new CSV file with the separated columns.
#write.csv(MillenialMigration_Sep, file = "C:\\Users\\kbarr\\OneDrive\\Documents\\GitHub\\601_Fall_2022\\posts\\MillenialMigration_Sep.csv", row.names = FALSE)
# Load the previously exported table in which Race and Parental_Income are
# separate columns; the path is relative to the rendering working directory.
MillenialMigration_Sep <- read_csv("_data/MillenialMigration_Sep.csv")
Error: '_data/MillenialMigration_Sep.csv' does not exist in current working directory ('C:/Users/srika/OneDrive/Desktop/601_Fall_2022/posts').
Error in eval(expr, envir, enclos): object 'MillenialMigration_Sep' not found
This dataset shows migration patterns for people born between 1984 and 1992. I found the dataset through data-is-plural.com, which led me to migrationpatterns.org, where I downloaded the data. The data was originally drawn from US Census, tax, and HUD records. The origin commuting zone, city, and state give an individual's location at age 16, and the destination commuting zone, city, and state give that individual's location at age 26. The dataset also includes information on race/ethnicity and parental income.
I found the mean, median, and standard deviation for each numerical variable and the frequencies for each categorical variable, which I found more interesting to keep exploring.
The same 10 states experienced the most migration in and out: CA TX NY FL PA OH IL GA NC MI
California was the top state for both migration in and out.
Error in summarise(MillenialMigration_Sep, mean.TotalMigrators = mean(Num_Migrators, : object 'MillenialMigration_Sep' not found
Error in summarise(MillenialMigration_Sep, median.TotalMigrators = median(Num_Migrators, : object 'MillenialMigration_Sep' not found
Error in summarise(MillenialMigration_Sep, SD.TotalMigrators = sd(Num_Migrators, : object 'MillenialMigration_Sep' not found
Error in group_by(., Dest_State): object 'MillenialMigration_Sep' not found
Error in arrange(., desc(Num_Migrators)): object 'Migration_DestState' not found
Error in eval(expr, envir, enclos): object 'Migration_DestState' not found
Error in group_by(., Origin_State): object 'MillenialMigration_Sep' not found
Error in arrange(., desc(Num_Migrators)): object 'Migration_OriginState' not found
Error in eval(expr, envir, enclos): object 'Migration_OriginState' not found
Error in group_by(., Parental_Income): object 'MillenialMigration_Sep' not found
Error in eval(expr, envir, enclos): object 'MigratorsByIncome' not found
Error in group_by(., Race): object 'MillenialMigration_Sep' not found
Error in eval(expr, envir, enclos): object 'MigratorsByRace' not found
I made two simple point plots and two bar graphs to look at the total number of migrators by parental income quintile and by race. The two key takeaways are that the largest group of migrators was White and that the largest group came from the highest parental income quintile.
I also created a scatter plot to show the total number of migrators to and from each state. This graph is not all that useful, and I will continue to consider how it can be improved.
Error in ggplot(MigratorsByIncome, aes(Parental_Income, Num_Migrators)): object 'MigratorsByIncome' not found
Error in ggplot(data = MigratorsByIncome): object 'MigratorsByIncome' not found
Error in ggplot(MigratorsByRace, aes(Race, Num_Migrators)): object 'MigratorsByRace' not found
Error in ggplot(data = MigratorsByRace): object 'MigratorsByRace' not found
Error in rename(., Num_MigratorsOut = Num_Migrators, State = Origin_State): object 'Migration_OriginState' not found
Error in rename(., Num_Migrators_In = Num_Migrators, State = Dest_State): object 'Migration_DestState' not found
Error in inner_join(Migration_OriginState, Migration_DestState, by = "State"): object 'Migration_OriginState' not found
Error in eval(expr, envir, enclos): object 'Migration_In_And_Out' not found
Error in ggplot(Migration_In_And_Out, aes(Num_MigratorsOut, Num_Migrators_In)): object 'Migration_In_And_Out' not found
I decided to further explore the migration patterns for California, as it was the state that saw the most movement in and out. I looked at the migration by Race and found that White and Hispanic people migrated the most, with a slight variation between in (White more in) and out (Hispanic more out).
Error in filter(., Dest_State == "California"): object 'MillenialMigration_Sep' not found
Error in filter(., Origin_State == "California"): object 'MillenialMigration_Sep' not found
Error in group_by(., Race): object 'MillenialMigration_to_CA' not found
Error in group_by(., Race): object 'MillenialMigration_from_CA' not found
Error in eval(expr, envir, enclos): object 'CA_Migrators_By_Race_From' not found
Error in eval(expr, envir, enclos): object 'CA_Migrators_By_Race_To' not found
Error in inner_join(CA_Migrators_By_Race_To, CA_Migrators_By_Race_From, : object 'CA_Migrators_By_Race_To' not found
Error in eval(expr, envir, enclos): object 'CA_Migration_In_And_Out' not found
Error in ggplot(data = CA_Migration_In_And_Out): object 'CA_Migration_In_And_Out' not found
Error in ggplot(data = CA_Migration_In_And_Out): object 'CA_Migration_In_And_Out' not found
Some limitations and questions I am still exploring are: -How was the sample chosen? -How much migration is not captured by census data? -Is this data useful for municipalities to prepare for new citizens? -What are the income ranges for the parental income quintiles? -Can I change the Y axis scaling? The bar chart and plot point graph for total migrators by income have different Y axis labels.
---
title: "HW3"
author: "Karla Barrett-Dexter"
description: "Millennial Migration Patterns"
date: "12/11/2022"
format:
  html:
    toc: true
    code-fold: true
    code-copy: true
    code-tools: true
    df-print: paged
categories:
- hw3
- Karla Barrett-Dexter
- Millenial Migration Data
---
```{r}
#| label: setup
#| warning: false
library(tidyverse)
knitr::opts_chunk$set(echo = TRUE)
```
## Read in the data
```{r}
#| label: Read in data
#| warning: false
# Read the raw migration file; the path is relative to the directory the
# document is rendered from, so `_data/od.csv` must exist there.
MillenialMigration <- read_csv("_data/od.csv")
# Show the raw table (original column names such as o_cz, d_cz, n, pool).
MillenialMigration
# comment to test changes
# 2nd comment to test changes
```
## Tidy Data
As a continuation of HW2, I renamed the columns again for ease of understanding. Additionally, I split the combined race/parental-income column into two separate columns, one per variable, because I was interested in exploring these data points independently of each other. Running this code took a very long time every time I returned to work on the assignment, so I exported the result to a new CSV file. All the code used to get to that step is kept below as comments to show my work, and the analysis continues from the new file to avoid the long run times.
```{r}
#| label: Tidy data
#| warning: false
# Give the raw columns descriptive names. Each original column is renamed
# exactly once (the earlier version listed `Num_Migrators = n` twice).
MillenialMigration <- MillenialMigration %>%
  rename(
    Origin_Zone = o_cz,
    Origin_City = o_cz_name,
    Origin_State = o_state_name,
    Dest_Zone = d_cz,
    Dest_City = d_cz_name,
    Dest_State = d_state_name,
    Num_Migrators = n,
    N_from_Origin = n_tot_o,
    N_from_Dest = n_tot_d,
    Race_ParentalIncome = pool
  )
# Show the renamed table.
MillenialMigration
#I wanted to separate the Race and Parental Income data, in order to analyze the data separately in future iterations. I could not figure out a way to separate the two without putting a character in between. I used the following code to update the Race_ParentalIncome column to have an underscore in it. This was probably not the most efficient way to accomplish this outcome and it took me quite a while to get the code right.In addition, I found it was taking a very long time to run this code every time I returned to work on the assignment, so I decided to export the new CSV file and am writing in all the code used to get to this step as comments to show my work and continuing with the new file to avoid the time issues I was having running the code.
#MillenialMigration_ <- MillenialMigration %>%
# mutate(Race_ParentalIncome = stringr::str_replace(Race_ParentalIncome, "AsianQ1", "Asian_Q1"))%>%
# mutate(Race_ParentalIncome = stringr::str_replace(Race_ParentalIncome, "AsianQ2", "Asian_Q2"))%>%
# mutate(Race_ParentalIncome = stringr::str_replace(Race_ParentalIncome, "AsianQ3", "Asian_Q3"))%>%
# mutate(Race_ParentalIncome = stringr::str_replace(Race_ParentalIncome, "AsianQ4", "Asian_Q4"))%>%
# mutate(Race_ParentalIncome = stringr::str_replace(Race_ParentalIncome, "AsianQ5", "Asian_Q5"))%>%
# mutate(Race_ParentalIncome = stringr::str_replace(Race_ParentalIncome, "BlackQ1", "Black_Q1"))%>%
# mutate(Race_ParentalIncome = stringr::str_replace(Race_ParentalIncome, "BlackQ2", "Black_Q2"))%>%
# mutate(Race_ParentalIncome = stringr::str_replace(Race_ParentalIncome, "BlackQ3", "Black_Q3"))%>%
# mutate(Race_ParentalIncome = stringr::str_replace(Race_ParentalIncome, "BlackQ4", "Black_Q4"))%>%
# mutate(Race_ParentalIncome = stringr::str_replace(Race_ParentalIncome, "BlackQ5", "Black_Q5"))%>%
# mutate(Race_ParentalIncome = stringr::str_replace(Race_ParentalIncome, "HispanicQ1", "Hispanic_Q1"))%>%
# mutate(Race_ParentalIncome = stringr::str_replace(Race_ParentalIncome, "HispanicQ2", "Hispanic_Q2"))%>%
# mutate(Race_ParentalIncome = stringr::str_replace(Race_ParentalIncome, "HispanicQ3", "Hispanic_Q3"))%>%
# mutate(Race_ParentalIncome = stringr::str_replace(Race_ParentalIncome, "HispanicQ4", "Hispanic_Q4"))%>%
# mutate(Race_ParentalIncome = stringr::str_replace(Race_ParentalIncome, "HispanicQ5", "Hispanic_Q5"))%>%
# mutate(Race_ParentalIncome = stringr::str_replace(Race_ParentalIncome, "OtherQ1", "Other_Q1"))%>%
# mutate(Race_ParentalIncome = stringr::str_replace(Race_ParentalIncome, "OtherQ2", "Other_Q2"))%>%
# mutate(Race_ParentalIncome = stringr::str_replace(Race_ParentalIncome, "OtherQ3", "Other_Q3"))%>%
# mutate(Race_ParentalIncome = stringr::str_replace(Race_ParentalIncome, "OtherQ4", "Other_Q4"))%>%
# mutate(Race_ParentalIncome = stringr::str_replace(Race_ParentalIncome, "OtherQ5", "Other_Q5"))%>%
# mutate(Race_ParentalIncome = stringr::str_replace(Race_ParentalIncome, "WhiteQ1", "White_Q1"))%>%
# mutate(Race_ParentalIncome = stringr::str_replace(Race_ParentalIncome, "WhiteQ2", "White_Q2"))%>%
# mutate(Race_ParentalIncome = stringr::str_replace(Race_ParentalIncome, "WhiteQ3", "White_Q3"))%>%
# mutate(Race_ParentalIncome = stringr::str_replace(Race_ParentalIncome, "WhiteQ4", "White_Q4"))%>%
# mutate(Race_ParentalIncome = stringr::str_replace(Race_ParentalIncome, "WhiteQ5", "White_Q5"))
#I used the following code to check my work
#MigratorsByRace_Income <- MillenialMigration_ %>%
# group_by(Race_ParentalIncome) %>%
#summarise(Freq = sum(Num_Migrators))
#print(n=30, MigratorsByRace_Income)
#The following code was used to separate the column and create two new columns, one for Race and one for Parental Income.
#MillenialMigration_Sep <- separate(MillenialMigration_, Race_ParentalIncome, into = c("Race", "Parental_Income"), sep = "_")
#MillenialMigration_Sep
#The following code was used to create a new CSV file with the separated columns.
#write.csv(MillenialMigration_Sep, file = "C:\\Users\\kbarr\\OneDrive\\Documents\\GitHub\\601_Fall_2022\\posts\\MillenialMigration_Sep.csv", row.names = FALSE)
# Load the tidied table in which Race and Parental_Income are separate columns.
# Splitting the full dataset is slow, so the result is cached as a CSV inside
# the project's `_data/` folder. If the cache is missing (fresh clone, another
# machine), rebuild it from the renamed `MillenialMigration` table instead of
# stopping with "file does not exist".
millenial_sep_path <- "_data/MillenialMigration_Sep.csv"
if (!file.exists(millenial_sep_path)) {
  # Values of Race_ParentalIncome look like "AsianQ1": a race label followed by
  # an income quintile code Q1-Q5. One regex with two capture groups splits
  # them directly, so no separator needs to be inserted first.
  # NOTE(review): assumes every value ends in Q1-Q5, as in the replacement
  # list kept in the comments above; other values would become NA.
  millenial_sep <- MillenialMigration %>%
    tidyr::extract(
      Race_ParentalIncome,
      into = c("Race", "Parental_Income"),
      regex = "^(.+)(Q[1-5])$"
    )
  write_csv(millenial_sep, millenial_sep_path)
}
MillenialMigration_Sep <- read_csv(millenial_sep_path)
MillenialMigration_Sep
```
## Narrative
This dataset shows migration patterns for people born between 1984 and 1992. I found the dataset through data-is-plural.com, which led me to migrationpatterns.org, where I downloaded the data. The data was originally drawn from US Census, tax, and HUD records. The origin commuting zone, city, and state give an individual's location at age 16, and the destination commuting zone, city, and state give that individual's location at age 26. The dataset also includes information on race/ethnicity and parental income.
## Descriptive Statistics
I found the mean, median, and standard deviation for each numerical variable and the frequencies for each categorical variable, which I found more interesting to keep exploring.
The same 10 states experienced the most migration in and out:
CA
TX
NY
FL
PA
OH
IL
GA
NC
MI
California was the top state for both migration in and out.
```{r}
#| label: Descriptive Statistics
#| warning: false
# Numerical variables ----
# Mean, median, and standard deviation of the three count columns,
# ignoring missing values. Each summary is printed as a one-row table.
MillenialMigration_Sep %>%
  summarise(
    mean.TotalMigrators = mean(Num_Migrators, na.rm = TRUE),
    mean.NfromOrigin = mean(N_from_Origin, na.rm = TRUE),
    mean.NFromDest = mean(N_from_Dest, na.rm = TRUE)
  )
MillenialMigration_Sep %>%
  summarise(
    median.TotalMigrators = median(Num_Migrators, na.rm = TRUE),
    median.NfromOrigin = median(N_from_Origin, na.rm = TRUE),
    median.NFromDest = median(N_from_Dest, na.rm = TRUE)
  )
MillenialMigration_Sep %>%
  summarise(
    SD.TotalMigrators = sd(Num_Migrators, na.rm = TRUE),
    sd.NfromOrigin = sd(N_from_Origin, na.rm = TRUE),
    sd.NFromDest = sd(N_from_Dest, na.rm = TRUE)
  )

# Categorical variables ----
# Total migrators arriving in each destination state, largest first.
Migration_DestState <- MillenialMigration_Sep %>%
  group_by(Dest_State) %>%
  summarise(Num_Migrators = sum(Num_Migrators)) %>%
  arrange(desc(Num_Migrators))
Migration_DestState

# Total migrators leaving each origin state, largest first.
Migration_OriginState <- MillenialMigration_Sep %>%
  group_by(Origin_State) %>%
  summarise(Num_Migrators = sum(Num_Migrators)) %>%
  arrange(desc(Num_Migrators))
Migration_OriginState

# Total migrators in each parental income quintile.
MigratorsByIncome <- MillenialMigration_Sep %>%
  group_by(Parental_Income) %>%
  summarise(Num_Migrators = sum(Num_Migrators))
MigratorsByIncome

# Total migrators in each race group.
MigratorsByRace <- MillenialMigration_Sep %>%
  group_by(Race) %>%
  summarise(Num_Migrators = sum(Num_Migrators))
MigratorsByRace
```
## Visualizations
I made two simple point plots and two bar graphs to look at the total number of migrators by parental income quintile and by race. The two key takeaways are that the largest group of migrators was White and that the largest group came from the highest parental income quintile.
I also created a scatter plot to show the total number of migrators to and from each state. This graph is not all that useful, and I will continue to consider how it can be improved.
```{r}
#| label: Visualizations
#| warning: false
# Migrators by parental income quintile: first as points, then as bars.
ggplot(MigratorsByIncome, aes(x = Parental_Income, y = Num_Migrators)) +
  geom_point(color = "blue", shape = 0) +
  ggtitle("Migrators by Income")
ggplot(
  MigratorsByIncome,
  aes(x = Parental_Income, y = Num_Migrators, fill = Parental_Income)
) +
  geom_col() +
  ggtitle("Migrators by Income")

# Migrators by race: first as points, then as bars.
ggplot(MigratorsByRace, aes(x = Race, y = Num_Migrators)) +
  geom_point() +
  ggtitle("Migrators by Race")
ggplot(MigratorsByRace, aes(x = Race, y = Num_Migrators, fill = Race)) +
  geom_col() +
  ggtitle("Migrators by Race")

# Build one table per state with the totals moving out (origin) and in
# (destination). Both state tables get a shared `State` key and distinct
# count column names so they can be joined side by side.
Migration_OriginState <- rename(
  Migration_OriginState,
  Num_MigratorsOut = Num_Migrators,
  State = Origin_State
)
Migration_DestState <- rename(
  Migration_DestState,
  Num_Migrators_In = Num_Migrators,
  State = Dest_State
)
Migration_In_And_Out <- Migration_OriginState %>%
  inner_join(Migration_DestState, by = "State")
Migration_In_And_Out

# Scatter plot comparing migrators out (x) with migrators in (y), one point
# per state, colored by state.
ggplot(
  Migration_In_And_Out,
  aes(x = Num_MigratorsOut, y = Num_Migrators_In, color = State)
) +
  geom_point()
```
## Groupings
I decided to further explore the migration patterns for California, as it was the state that saw the most movement in and out. I looked at the migration by Race and found that White and Hispanic people migrated the most, with a slight variation between in (White more in) and out (Hispanic more out).
```{r}
#| label: Groupings
#| warning: false
# Rows where California is the destination, and rows where it is the origin.
MillenialMigration_to_CA <- filter(
  MillenialMigration_Sep,
  Dest_State == "California"
)
MillenialMigration_from_CA <- filter(
  MillenialMigration_Sep,
  Origin_State == "California"
)

# Total migrators per race group, separately for each direction.
CA_Migrators_By_Race_To <- MillenialMigration_to_CA %>%
  group_by(Race) %>%
  summarise(Num.Migrators.To.CA = sum(Num_Migrators))
CA_Migrators_By_Race_From <- MillenialMigration_from_CA %>%
  group_by(Race) %>%
  summarise(Num.Migrators.From.CA = sum(Num_Migrators))
CA_Migrators_By_Race_From
CA_Migrators_By_Race_To

# One row per race with both directions side by side.
CA_Migration_In_And_Out <- CA_Migrators_By_Race_To %>%
  inner_join(CA_Migrators_By_Race_From, by = "Race")
CA_Migration_In_And_Out

# Bar charts of migrators into and out of California by race.
ggplot(
  CA_Migration_In_And_Out,
  aes(x = Race, y = Num.Migrators.To.CA, fill = Race)
) +
  geom_col() +
  ggtitle("Migrators to CA by Race")
ggplot(
  CA_Migration_In_And_Out,
  aes(x = Race, y = Num.Migrators.From.CA, fill = Race)
) +
  geom_col() +
  ggtitle("Migrators From CA by Race")
```
## Limitations
Some limitations and questions I am still exploring are:

- How was the sample chosen?
- How much migration is not captured by census data?
- Is this data useful for municipalities to prepare for new citizens?
- What are the income ranges for the parental income quintiles?
- Can I change the Y axis scaling? The bar chart and the point plot for total migrators by income have different Y axis labels.