Exploration of 1987 Crime data
knitr::opts_chunk$set(echo = TRUE)
#read in and tidy 1987 crime data
#load libraries
library(tidyverse)
library(readxl)
#read in the file
crime1987 <- read_excel("~/DATA/crime data/da09252.xlsx",
col_types = c("numeric", "numeric", "numeric",
"numeric", "text", "text", "numeric",
"numeric", "numeric", "numeric",
"numeric", "numeric", "numeric",
"numeric", "numeric", "numeric",
"numeric", "numeric", "numeric",
"numeric", "numeric", "numeric",
"numeric", "numeric", "numeric",
"numeric", "numeric", "numeric",
"numeric", "numeric", "numeric",
"numeric", "numeric", "numeric",
"numeric", "numeric", "numeric",
"numeric", "numeric", "numeric",
"numeric", "numeric", "numeric",
"numeric", "numeric", "numeric",
"numeric", "numeric", "numeric",
"numeric", "numeric", "numeric",
"numeric"))
#tidy FIPS codes
#rename select columns as a first step to tidying the FIPS codescrime1987 <- mutate(crime1987, FIPS5 = paste("county", "state"))
crime1987 <- crime1987 %>%
rename(FIPS_state = `FIPS STATE CODE - SEE CODEBOOK`,
FIPS_county = `FIPS COUNTY CODE - SEE CODEBOOK`)
#create new state column with padded 0s. Doing it this way preserves FIPS_state as is, which I will combine with FIPS_county in a few steps to create the string that matches to the county name in this dataset. I need to preserve them separately because the FIPS codes are formatted differently in other datasets
crime1987 <- mutate(crime1987, state = FIPS_state)
crime1987$state <-str_pad(crime1987$state, 2, pad = "0")
#add leading 0s to FIPS_county and duplicate it so that it still exists after I combine them in the next step
crime1987$FIPS_county <- str_pad(crime1987$FIPS_county, 3, pad = "0")
crime1987 <- mutate(crime1987, county = FIPS_county)
#unite() combines the state and county into a single variable separated by ".", FIPS. However, it does not preserve the original two variables, which is why I preserved them by duplicating them in the last steps.
crime1987 <- unite(crime1987, FIPS, FIPS_state, FIPS_county, sep = ".")
#create a new variable, FIPS5 that combines county and state into a single variable
crime1987 <- mutate(crime1987, FIPS5 = paste(state,county, sep = ""))
#create arrest rate variables, first renaming the columns to avoid confusion
crime1987 <- crime1987 %>%
rename(MJ_manufact_arrests = "MARIJUANA SALE/MANUF", MJ_possession_arrests = "MARIJUANA POSSESSION") %>%
mutate(MJ_manufact_RATE = MJ_manufact_arrests / POPULATION * 100000,
MJ_possession_RATE = MJ_possession_arrests / POPULATION * 100000)
#create a new datatable, MJcrime1987 that narrows just to the info I'm interested in
#select the relevant columns and eliminate counties with 0 population and 0 MJ_manufact_arrests
MJcrime1987 <- filter(crime1987, MJ_manufact_RATE > 0, POPULATION != 0) %>%
select(FIPS, FIPS5, POPULATION, MJ_manufact_arrests, MJ_manufact_RATE, MJ_possession_arrests, MJ_possession_RATE)
I want to read in and clean the Rural-Urban Continuum Codes: https://www.ers.usda.gov/data-products/rural-urban-continuum-codes/ and USDA ERS County Typology codes: https://www.ers.usda.gov/data-products/county-typology-codes/
#read in USDA rural urban continuum codes from the USDA
ruralurbancodes83_93 <- read_excel("~/DATA/USDA data originals and codebook/ruralurbancodes83_93.xls")
#join crime1987 and MJcrime1987 with ruralurbancodes83_93 by FIPS5
crime1987_countytype <- inner_join(crime1987, ruralurbancodes83_93, by = "FIPS5")
MJcrime1987_countytype <- inner_join(MJcrime1987, ruralurbancodes83_93, by = "FIPS5")
These are county typologies that give information about the economic activity within counties. There are 6 mutually exclusive economic types: farming, mining, manufacturing, government, services, nonspecialized. A county with at least 20% of its economy dependent on a particular sector in 1989 will have a 1 in that column in the table. There are 5 additional categories that are not mutually exclusive: retirement-destination, Federal lands, persistent poverty, commuting and transfers-dependent.
countytypology89 <- read_excel("~/DATA/USDA data originals and codebook/typology89.xls")
#create new variables, econ_type and econ_char using the 11 columns from the USDA data
countytypology89 <- countytypology89 %>%
mutate(econ_type = case_when(
`FM` == 1 ~ "Farming Dependent",
`MI` == 1 ~ "Mining Dependent",
`MF` == 1 ~ "Manufacturing Dependent",
`GV` == 1 ~ "Government Dependent",
`TS` == 1 ~ "Services Dependent",
`NS` == 1 ~ "Nonspecialized"),
econ_char = case_when(
`RT` == 1 ~ "Retirement Destination",
`FL` == 1 ~ "Federal Lands",
`CM` == 1 ~ "Commuting",
`PV` == 1 ~ "Persistent Poverty",
`TP` == 1 ~ "Transfers-dependent")
)
# join with the other data
MJcrime1987_allCountyinfo <- inner_join(MJcrime1987_countytype, countytypology89, by = "FIPS5")
crime1987_allCountyinfo <- inner_join(crime1987_countytype, countytypology89, by = "FIPS5")
#narrow down to the relevant stats
tidy_MJ_Counties <- MJcrime1987_allCountyinfo %>%
select("FIPS5", "POPULATION", "MJ_manufact_arrests", "MJ_manufact_RATE", "MJ_possession_arrests", "MJ_possession_RATE", "State.x", "County Name", "1993 Rural-urban Continuum Code","RuralUrban1993", "econ_type", "econ_char")
tidy_MJ_Counties
# A tibble: 1,857 x 12
FIPS5 POPULATION MJ_manufact_arr~ MJ_manufact_RATE MJ_possession_a~
<chr> <dbl> <dbl> <dbl> <dbl>
1 01003 92364 23 24.9 114
2 01005 25492 5 19.6 28
3 01009 39295 5 12.7 35
4 01015 124738 14 11.2 223
5 01017 40102 19 47.4 29
6 01019 19345 11 56.9 17
7 01021 31336 6 19.1 28
8 01023 17129 7 40.9 8
9 01031 40505 17 42.0 58
10 01033 54913 14 25.5 76
# ... with 1,847 more rows, and 7 more variables:
# MJ_possession_RATE <dbl>, State.x <chr>, County Name <chr>,
# 1993 Rural-urban Continuum Code <dbl>, RuralUrban1993 <chr>,
# econ_type <chr>, econ_char <chr>
I am going to focus for now on tidy_MJ_Counties rather than the full data.
#find the mean and other summary stats for the arrest rate
tidy_MJ_Counties %>%
summarize(mean(MJ_manufact_RATE), median(MJ_manufact_RATE), sd(MJ_manufact_RATE), mean(MJ_possession_RATE), median(MJ_possession_RATE), sd(MJ_possession_RATE), na.rm = TRUE)
# A tibble: 1 x 7
`mean(MJ_manufac~ `median(MJ_manu~ `sd(MJ_manufact~ `mean(MJ_posses~
<dbl> <dbl> <dbl> <dbl>
1 35.3 19.8 62.4 112.
# ... with 3 more variables: median(MJ_possession_RATE) <dbl>,
# sd(MJ_possession_RATE) <dbl>, na.rm <lgl>
#Median would be a better measure, I think, since the data is so skewed.
#group_by econ_type
tidy_MJ_Counties %>%
group_by(econ_type) %>%
summarize(mean(MJ_manufact_RATE), median(MJ_manufact_RATE), sd(MJ_manufact_RATE), mean(MJ_possession_RATE), median(MJ_possession_RATE), sd(MJ_possession_RATE), na.rm = TRUE)
# A tibble: 7 x 8
econ_type `mean(MJ_manufact~ `median(MJ_manufa~ `sd(MJ_manufact_~
<chr> <dbl> <dbl> <dbl>
1 Farming Dep~ 59.1 31.4 85.7
2 Government ~ 57.7 29.9 155.
3 Manufacturi~ 32.5 20.1 35.7
4 Mining Depe~ 43.6 29.7 55.9
5 Nonspeciali~ 34.8 19.6 38.0
6 Services De~ 38.8 19.7 48.7
7 <NA> 23.8 15.5 29.1
# ... with 4 more variables: mean(MJ_possession_RATE) <dbl>,
# median(MJ_possession_RATE) <dbl>, sd(MJ_possession_RATE) <dbl>,
# na.rm <lgl>
#group_by econ_char
tidy_MJ_Counties %>%
group_by(econ_char) %>%
summarize(mean(MJ_manufact_RATE), median(MJ_manufact_RATE), sd(MJ_manufact_RATE), mean(MJ_possession_RATE), median(MJ_possession_RATE), sd(MJ_possession_RATE), na.rm = TRUE)
# A tibble: 6 x 8
econ_char `mean(MJ_manufact~ `median(MJ_manufa~ `sd(MJ_manufact_~
<chr> <dbl> <dbl> <dbl>
1 Commuting 34.4 22.0 44.2
2 Federal Lan~ 70.4 35.2 174.
3 Persistent ~ 48.7 31.2 53.4
4 Retirement ~ 47.7 33.9 56.9
5 Transfers-d~ 45.5 26.7 57.4
6 <NA> 28.1 17.2 39.8
# ... with 4 more variables: mean(MJ_possession_RATE) <dbl>,
# median(MJ_possession_RATE) <dbl>, sd(MJ_possession_RATE) <dbl>,
# na.rm <lgl>
#group_by econ_type and econ_char
tidy_MJ_Counties %>%
group_by(econ_type, econ_char) %>%
summarize(mean(MJ_manufact_RATE), median(MJ_manufact_RATE), sd(MJ_manufact_RATE), mean(MJ_possession_RATE), median(MJ_possession_RATE), sd(MJ_possession_RATE), na.rm = TRUE)
# A tibble: 41 x 9
# Groups: econ_type [7]
econ_type econ_char `mean(MJ_manufact~ `median(MJ_manufac~
<chr> <chr> <dbl> <dbl>
1 Farming Depe~ Commuting 47.1 22.1
2 Farming Depe~ Federal Lands 57.3 42.0
3 Farming Depe~ Persistent Po~ 71.7 37.5
4 Farming Depe~ Retirement De~ 65.1 38.0
5 Farming Depe~ Transfers-dep~ 45.6 40.5
6 Farming Depe~ <NA> 56.2 29.1
7 Government D~ Commuting 36.9 28.1
8 Government D~ Federal Lands 160. 31.9
9 Government D~ Persistent Po~ 37.8 25.0
10 Government D~ Retirement De~ 59.0 46.1
# ... with 31 more rows, and 5 more variables:
# sd(MJ_manufact_RATE) <dbl>, mean(MJ_possession_RATE) <dbl>,
# median(MJ_possession_RATE) <dbl>, sd(MJ_possession_RATE) <dbl>,
# na.rm <lgl>
This is revealing, actually. In Government Dependent counties with substantial Federal land, the mean manufacturing arrest rate is absurdly high. Not surprising to me at all given how much time the Forest Service police spent in enforcement. Of course the median is much more reasonable, presumably because there are a few outlier counties (high profile busts, most likely). I also realize looking at this that I’m actually pretty interested in the outliers.
#group_by by econ_char and econ_type
tidy_MJ_Counties %>%
group_by(econ_char, econ_type) %>%
summarize(mean(MJ_manufact_RATE), median(MJ_manufact_RATE), sd(MJ_manufact_RATE), mean(MJ_possession_RATE), median(MJ_possession_RATE), sd(MJ_possession_RATE), na.rm = TRUE)
# A tibble: 41 x 9
# Groups: econ_char [6]
econ_char econ_type `mean(MJ_manufact_~ `median(MJ_manufac~
<chr> <chr> <dbl> <dbl>
1 Commuting Farming Depend~ 47.1 22.1
2 Commuting Government Dep~ 36.9 28.1
3 Commuting Manufacturing ~ 29.1 21.3
4 Commuting Mining Depende~ 31.4 31.4
5 Commuting Nonspecialized 33.1 18.6
6 Commuting Services Depen~ 30.0 20.8
7 Commuting <NA> 9.24 9.24
8 Federal La~ Farming Depend~ 57.3 42.0
9 Federal La~ Government Dep~ 160. 31.9
10 Federal La~ Manufacturing ~ 40.0 17.0
# ... with 31 more rows, and 5 more variables:
# sd(MJ_manufact_RATE) <dbl>, mean(MJ_possession_RATE) <dbl>,
# median(MJ_possession_RATE) <dbl>, sd(MJ_possession_RATE) <dbl>,
# na.rm <lgl>
#group_by by econ_char and econ_type then ungroup and arrange in descending order
tidy_MJ_Counties %>%
group_by(econ_type, econ_char) %>%
summarize(mean(MJ_manufact_RATE), medianMJ_manufactRATE = median(MJ_manufact_RATE), sd(MJ_manufact_RATE), mean(MJ_possession_RATE), median(MJ_possession_RATE), sd(MJ_possession_RATE), na.rm = TRUE) %>%
ungroup() %>%
arrange(desc(medianMJ_manufactRATE))
# A tibble: 41 x 9
econ_type econ_char `mean(MJ_manufact_~ medianMJ_manufac~
<chr> <chr> <dbl> <dbl>
1 <NA> Federal Lands 119. 100.
2 Manufacturing~ Transfers-dep~ 59.1 52.6
3 Mining Depend~ Retirement De~ 79.5 50.1
4 Government De~ Retirement De~ 59.0 46.1
5 Services Depe~ Transfers-dep~ 79.4 43.8
6 Nonspecialized Federal Lands 45.7 42.5
7 Farming Depen~ Federal Lands 57.3 42.0
8 Government De~ Transfers-dep~ 45.6 40.6
9 Farming Depen~ Transfers-dep~ 45.6 40.5
10 Services Depe~ Persistent Po~ 49.7 38.4
# ... with 31 more rows, and 5 more variables:
# sd(MJ_manufact_RATE) <dbl>, mean(MJ_possession_RATE) <dbl>,
# median(MJ_possession_RATE) <dbl>, sd(MJ_possession_RATE) <dbl>,
# na.rm <lgl>
# A tibble: 41 x 3
econ_char econ_type n
<chr> <chr> <int>
1 Commuting Farming Dependent 21
2 Commuting Government Dependent 20
3 Commuting Manufacturing Dependent 33
4 Commuting Mining Dependent 2
5 Commuting Nonspecialized 43
6 Commuting Services Dependent 12
7 Commuting <NA> 1
8 Federal Lands Farming Dependent 20
9 Federal Lands Government Dependent 20
10 Federal Lands Manufacturing Dependent 18
# ... with 31 more rows
#group_by by state and arrange in descending order by median manufacturing arrest rate
tidy_MJ_Counties %>%
group_by(State.x) %>%
summarize(mean(MJ_manufact_RATE), medianMJ_manufactRATE = median(MJ_manufact_RATE), sd(MJ_manufact_RATE), mean(MJ_possession_RATE), median(MJ_possession_RATE), sd(MJ_possession_RATE), na.rm = TRUE) %>%
ungroup() %>%
arrange(desc(medianMJ_manufactRATE))
# A tibble: 49 x 8
State.x `mean(MJ_manufact_R~ medianMJ_manufactR~ `sd(MJ_manufact_R~
<chr> <dbl> <dbl> <dbl>
1 DC 108. 108. NA
2 AK 335. 100. 575.
3 LA 69.3 52.0 69.8
4 OK 49.6 43.5 45.7
5 AR 66.7 41.5 76.5
6 SC 49.0 41.3 33.4
7 AZ 44.2 39.7 26.6
8 HI 54.9 35.7 57.6
9 NM 49.9 35.0 46.7
10 NV 28.3 34.8 19.6
# ... with 39 more rows, and 4 more variables:
# mean(MJ_possession_RATE) <dbl>, median(MJ_possession_RATE) <dbl>,
# sd(MJ_possession_RATE) <dbl>, na.rm <lgl>
I don’t think that the state level gives as much information as the county level (and even that has limitations).
#filter for farming-dependent counties with manufacture rates higher than the median and possession rates lower than the median
(filter(tidy_MJ_Counties, MJ_manufact_RATE > 20, MJ_possession_RATE < 80, econ_type == "Farming Dependent"))
# A tibble: 61 x 12
FIPS5 POPULATION MJ_manufact_arr~ MJ_manufact_RATE MJ_possession_a~
<chr> <dbl> <dbl> <dbl> <dbl>
1 05049 10372 8 77.1 2
2 05077 15104 24 159. 12
3 05101 8257 2 24.2 6
4 05137 10070 7 69.5 4
5 06021 23684 19 80.2 18
6 06031 88071 20 22.7 49
7 06049 9638 8 83.0 7
8 06069 32604 21 64.4 26
9 08021 7971 4 50.2 0
10 08061 1917 2 104. 0
# ... with 51 more rows, and 7 more variables:
# MJ_possession_RATE <dbl>, State.x <chr>, County Name <chr>,
# 1993 Rural-urban Continuum Code <dbl>, RuralUrban1993 <chr>,
# econ_type <chr>, econ_char <chr>
# 61 observations meet these criteria
#filter for farming-dependent counties with manufacture rates higher than the median and possession rates lower than the median
(filter(tidy_MJ_Counties, MJ_manufact_RATE > 35, MJ_possession_RATE < 112, econ_type == "Farming Dependent"))
# A tibble: 48 x 12
FIPS5 POPULATION MJ_manufact_arr~ MJ_manufact_RATE MJ_possession_a~
<chr> <dbl> <dbl> <dbl> <dbl>
1 01019 19345 11 56.9 17
2 05049 10372 8 77.1 2
3 05077 15104 24 159. 12
4 05137 10070 7 69.5 4
5 05147 10573 10 94.6 11
6 06011 15379 23 150. 17
7 06021 23684 19 80.2 18
8 06049 9638 8 83.0 7
9 06069 32604 21 64.4 26
10 08021 7971 4 50.2 0
# ... with 38 more rows, and 7 more variables:
# MJ_possession_RATE <dbl>, State.x <chr>, County Name <chr>,
# 1993 Rural-urban Continuum Code <dbl>, RuralUrban1993 <chr>,
# econ_type <chr>, econ_char <chr>
# 48 observations meet these criteria
In this visualization, I am plotting the arrest rate for marijuana manufacture on the x axis and arrest rate for possession on the y axis. I’m using color to distinguish among the economic type for each county.
ggplot(data = tidy_MJ_Counties) +
geom_point(mapping = aes(x = MJ_manufact_RATE, y = MJ_possession_RATE, color = econ_type))
I’m going to try visualizing essentially the same data but using facet wrapping.
ggplot(data = tidy_MJ_Counties) +
geom_point(mapping = aes(x = MJ_manufact_RATE, y = MJ_possession_RATE), na.rm = TRUE) +
facet_wrap(~ econ_type, nrow = 2)
And here I’m looking at the manufacturing arrest rate by economic type.
ggplot(data = tidy_MJ_Counties) +
stat_summary(
mapping = aes(x = econ_type, y = MJ_manufact_RATE),
fun.min = min,
fun.max = max,
fun = median
) +
labs( x = "County Predominant Economic Activity", y = "Marijuana Manufacturing Arrest Rate")
Clearly government and farming counties have higher rates for manufacturing arrests. Otherwise this isn’t a very helpful chart because the values cluster at the bottom, eg they are so skewed.
ggplot(data = tidy_MJ_Counties) +
geom_col(mapping = aes(x = econ_type, y = MJ_manufact_RATE)) +
labs( title = "Marijuana Manufacture Arrest Rate by County Economic Type, 1987", x = "County Economic Type", y = "Marijuana Manufacture Arrest Rate")
Things I haven’t figured out:
a) why the bar graph and the scatterplot of the same data looks different (including the units on the y axis?)
b) how to make the labels for each of the categorical variable values on the x axis not overlap.
c) how to construct a visualization of the count of counties of each econ_type.
d) I haven’t looked at whether there’s anything obvious going on with proximity to a metropolitan area. The econ_type codes are all for non-metro counties. 1993 Rural-urban continuum code indicates proximity to metropolitan area.
Text and figures are licensed under Creative Commons Attribution CC BY-NC 4.0. The figures that have been reused from other sources don't fall under this license and can be recognized by a note in their caption: "Figure from ...".
For attribution, please cite this work as
Merleaux (2022, March 12). Data Analytics and Computational Social Science: HW4 Merleaux. Retrieved from https://github.com/DACSS/dacss_course_website/posts/httprpubscomamerleaux875768/
BibTeX citation
@misc{merleaux2022hw4, author = {Merleaux, April}, title = {Data Analytics and Computational Social Science: HW4 Merleaux}, url = {https://github.com/DACSS/dacss_course_website/posts/httprpubscomamerleaux875768/}, year = {2022} }