Visualizing a dataset

challenge5
Neha Jhurani
Total_cost_for_top_15_pathogens_2018.xlsx
Author

Neha Jhurani

Published

April 12, 2023

Code
# Attach the tidyverse collection of packages.
library(tidyverse)
# ggplot2 provides the plotting functions (ggplot(), geom_*(), scale_*())
# used in the visualizations below.
library(ggplot2)

# Show the source code of every chunk in the rendered document.
knitr::opts_chunk$set(echo = TRUE)

Total_cost_for_top_15_pathogens_2018

Code
library(readxl)

# Read the Total_cost_for_top_15_pathogens_2018 Excel workbook (.xlsx).
# NOTE(review): the second and third columns have no header in the sheet, so
# read_excel() assigns them the placeholder names `...2` and `...3`.
cost_of_illness_data <- read_excel("_data/Total_cost_for_top_15_pathogens_2018.xlsx")
New names:
• `` -> `...2`
• `` -> `...3`
Code
# Summarise every column of the raw import to check its length and type.
summary(cost_of_illness_data)
 Total cost of foodborne illness estimates for 15 leading foodborne pathogens
 Length:27                                                                   
 Class :character                                                            
 Mode  :character                                                            
     ...2               ...3          
 Length:27          Length:27         
 Class :character   Class :character  
 Mode  :character   Mode  :character  
Code
# The data shows the cost associated with foodborne diseases in 2018 for the
# top 15 foodborne pathogens causing them.

# Extract all the column names.
colnames(cost_of_illness_data)
[1] "Total cost of foodborne illness estimates for 15 leading foodborne pathogens"
[2] "...2"                                                                        
[3] "...3"                                                                        
Code
# Replace the uninformative imported headers (the sheet title and the
# auto-generated `...2` / `...3`) with short, descriptive column names:
#   1 -> pathogens, 2 -> case, 3 -> cost
colnames(cost_of_illness_data)[1:3] <- c("pathogens", "case", "cost")

# Print the names to confirm the rename.
colnames(cost_of_illness_data)
[1] "pathogens" "case"      "cost"     
Code
# Many rows of the sheet contain no values (NA) in at least one column;
# keep only the complete rows.
cost_of_illness_data <- na.omit(cost_of_illness_data)

# The last remaining row is the total over the 15 pathogens rather than a
# pathogen itself, so drop it. The row is addressed via nrow() instead of a
# hard-coded index so the step still removes the final row if the number of
# complete rows ever changes.
cost_of_illness_data <-
  cost_of_illness_data[-nrow(cost_of_illness_data), ]

# The columns were imported as character; convert the value columns to
# numeric so that they can be plotted.
cost_of_illness_data$case <- as.numeric(cost_of_illness_data$case)
cost_of_illness_data$cost <- as.numeric(cost_of_illness_data$cost)

# Print the cleaned data.
cost_of_illness_data
# A tibble: 15 × 3
   pathogens                                                         case   cost
   <chr>                                                            <dbl>  <dbl>
 1 Campylobacter spp. (all species)                                8.45e5 2.18e9
 2 Clostridium perfringens                                         9.66e5 3.84e8
 3 Cryptosporidium spp. (all species)                              5.76e4 5.84e7
 4 Cyclospora cayetanensis                                         1.14e4 2.57e6
 5 Listeria monocytogenes                                          1.59e3 3.19e9
 6 Norovirus                                                       5.46e6 2.57e9
 7 Salmonella (non-typhoidal species)                              1.03e6 4.14e9
 8 Shigella (all species)                                          1.31e5 1.59e8
 9 Shiga toxin-producing Escherichia coli O157 (STEC O157)         6.32e4 3.11e8
10 non-O157 Shiga toxin-producing Escherichia coli (STEC non-O157) 1.13e5 3.17e7
11 Toxoplasma gondii                                               8.67e4 3.74e9
12 Vibrio parahaemolyticus                                         3.47e4 4.57e7
13 Vibrio vulnificus                                               9.6 e1 3.59e8
14 Vibrio non-cholera species other than V. parahaemolyticus and … 1.76e4 8.17e7
15 Yersinia enterocolitica                                         9.77e4 3.13e8
Code
# Univariate visualization: distribution of the cost column.
# The number of bins is set explicitly instead of relying on the default of
# 30, which is far too many for the 15 observations in this dataset.
ggplot(cost_of_illness_data, aes(x = cost)) +
  geom_histogram(bins = 10)

Code
# Same histogram on a log10 x axis: the costs span several orders of
# magnitude, so the log scale spreads the observations out more evenly.
# The number of bins is set explicitly instead of relying on the default of
# 30, which is far too many for the 15 observations in this dataset.
ggplot(cost_of_illness_data, aes(x = cost)) +
  geom_histogram(bins = 10) +
  scale_x_continuous(trans = "log10")

Code
# Bivariate visualization: number of cases (x axis) against estimated cost
# (y axis). Every point is one pathogen and carries its name as a text label.
case_cost_mapping <- aes(x = case, y = cost, label = pathogens)

ggplot(data = cost_of_illness_data, mapping = case_cost_mapping) +
  geom_point(colour = "blue") +
  geom_text()

Code
# Cases against cost again, this time with both axes on a log10 scale and
# comma-formatted axis labels; each point is labelled with its pathogen.
ggplot(
  data = cost_of_illness_data,
  mapping = aes(x = case, y = cost, label = pathogens)
) +
  geom_point(colour = "red") +
  geom_text() +
  scale_x_log10(labels = scales::comma) +
  scale_y_log10(labels = scales::comma)

Code
# Boxplot of the cost column. Only `cost` is mapped (to y): a boxplot needs a
# discrete grouping on the other axis, and mapping the continuous `case`
# column to x without a group triggers ggplot2's "Continuous x aesthetic"
# warning while still drawing just a single box.
ggplot(cost_of_illness_data, aes(y = cost)) +
  geom_boxplot()