Blog post 5: further analysis of the Indian startup funding dataset, written as part of the course “Data Science Fundamentals”.
## Load the raw funding records. read.csv() already returns a data frame, so no
## extra data.frame() wrapper is needed. Text columns are read as factors.
## NOTE(review): the path is machine-specific -- adjust it before re-running.
id <- read.csv(
  "C:/Users/gunde/Downloads/Indian_Startup_Funding.csv",
  stringsAsFactors = TRUE
)
## Compact overview of every column; glimpse() comes from dplyr
dplyr::glimpse(id)
Rows: 3,044
Columns: 8
$ Date <fct> 09/01/2020, 13/01/2020, 09/01/2020, 02/01/2~
$ StartupName <fct> BYJUS, Shuttl, Mamaearth, WealthBucket, Fas~
$ IndustryVertical <fct> "E-Tech", "Transportation", "E-commerce", "~
$ SubVertical <fct> "E-learning", "App based shuttle service", ~
$ CityLocation <fct> "Bangalore", "Gurgaon", "Bangalore", "New D~
$ Investors <fct> "Tiger Global Management", "Susquehanna Gro~
$ InvestmentType <fct> Private Equity Round, Series C, Series B, P~
$ AmountUSD <fct> "20,00,00,000", "80,48,394", "1,83,58,860",~
## Convert the Date column from day/month/year text (e.g. "09/01/2020") to
## Date objects. The pattern goes through the named `format` argument; values
## that do not match it become NA.
id$Date <- as.Date(id$Date, format = "%d/%m/%Y")
## Confirm the column types after the conversion
str(id)
'data.frame': 3044 obs. of 8 variables:
$ Date : Date, format: "2020-01-09" ...
$ StartupName : Factor w/ 2453 levels "#Fame","121Policy",..: 276 1911 1308 2307 656 1579 2441 568 295 495 ...
$ IndustryVertical: Factor w/ 878 levels "360-degree view creating platform",..: 200 827 190 275 254 440 358 822 191 7 ...
$ SubVertical : Factor w/ 1943 levels "\"Women\\\\'s Fashion Clothing Online Platform\"",..: 461 75 1647 1279 515 1487 1216 28 112 1681 ...
$ CityLocation : Factor w/ 100 levels "Agra","Ahemadabad",..: 5 29 5 62 55 19 29 74 29 5 ...
$ Investors : Factor w/ 2405 levels " Sandeep Aggarwal, Teruhide Sato",..: 2114 2061 1857 2316 1997 468 218 1827 1523 1365 ...
$ InvestmentType : Factor w/ 55 levels "Angel","Angel / Seed Funding",..: 26 43 41 21 36 40 26 40 44 30 ...
$ AmountUSD : Factor w/ 476 levels "1,00,00,00,000",..: 184 451 75 263 129 469 115 386 406 336 ...
## Derive the calendar year of each record from its Date (stored as numeric)
id[["year"]] <- as.numeric(format(id[["Date"]], format = "%Y"))
## Count how many records fall in each year
yeartable <- table(id[["year"]])
## Keep only the rows that have no missing value in any column
Cleanstartup <- id[which(complete.cases(id)), , drop = FALSE]
## Report rows x columns of the cleaned data
dim(Cleanstartup)
[1] 3038 9
## Take a working copy; the recoding that follows is applied to the copy
Cleanstartup1 <- Cleanstartup
## Inspect the structure of the working copy
utils::str(Cleanstartup1)
'data.frame': 3038 obs. of 9 variables:
$ Date : Date, format: "2020-01-09" ...
$ StartupName : Factor w/ 2453 levels "#Fame","121Policy",..: 276 1911 1308 2307 656 1579 2441 568 295 495 ...
$ IndustryVertical: Factor w/ 878 levels "360-degree view creating platform",..: 200 827 190 275 254 440 358 822 191 7 ...
$ SubVertical : Factor w/ 1943 levels "\"Women\\\\'s Fashion Clothing Online Platform\"",..: 461 75 1647 1279 515 1487 1216 28 112 1681 ...
$ CityLocation : Factor w/ 100 levels "Agra","Ahemadabad",..: 5 29 5 62 55 19 29 74 29 5 ...
$ Investors : Factor w/ 2405 levels " Sandeep Aggarwal, Teruhide Sato",..: 2114 2061 1857 2316 1997 468 218 1827 1523 1365 ...
$ InvestmentType : Factor w/ 55 levels "Angel","Angel / Seed Funding",..: 26 43 41 21 36 40 26 40 44 30 ...
$ AmountUSD : Factor w/ 476 levels "1,00,00,00,000",..: 184 451 75 263 129 469 115 386 406 336 ...
$ year : num 2020 2020 2020 2020 2020 ...
## Collapse every city outside the six listed ones into "Others".
## The column is converted from factor to character first so that a new label
## can be assigned freely.
major_cities <- c(
  "Bangalore", "Mumbai", "New Delhi", "Gurgaon", "Pune", "Hyderabad"
)
Cleanstartup1$CityLocation <- as.character(Cleanstartup1$CityLocation)
## The is.na() guard leaves missing values untouched (%in% alone would
## relabel them), matching what a chain of `!=` comparisons does.
other_city <- !is.na(Cleanstartup1$CityLocation) &
  !(Cleanstartup1$CityLocation %in% major_cities)
Cleanstartup1$CityLocation[other_city] <- "Others"

## Same treatment for the industry vertical, applied to a further copy:
## anything outside the five listed sectors becomes "OtherSectors".
startup2 <- Cleanstartup1
major_sectors <- c(
  "Consumer Internet", "Technology", "E-Commerce", "Finance", "Transportation"
)
startup2$IndustryVertical <- as.character(startup2$IndustryVertical)
other_sector <- !is.na(startup2$IndustryVertical) &
  !(startup2$IndustryVertical %in% major_sectors)
startup2$IndustryVertical[other_sector] <- "OtherSectors"
## Record counts per recoded sector
table(startup2$IndustryVertical)
Consumer Internet E-Commerce Finance
941 287 63
OtherSectors Technology Transportation
1265 478 4
## Frequency of each investment type, as a data frame (columns Var1 and Freq)
TopIT <- table(startup2$InvestmentType)
TopIT.df <- as.data.frame(TopIT)
## Keep the ten most frequent types. The ranking column is named explicitly
## instead of letting top_n() pick one and print a "Selecting by" message.
## top_n() is superseded by slice_max(), but slice_max() sorts its result by
## the ranking column while top_n() keeps the rows in their existing order.
top10IT <- dplyr::top_n(TopIT.df, n = 10, wt = Freq)
top10IT
Var1 Freq
1 Debt Funding 25
2 Private Equity 1355
3 Seed / Angel Funding 47
4 Seed Funding 1350
5 Seed/ Angel Funding 60
6 Seed/Angel Funding 23
7 Seed\\\\nFunding 30
8 Series A 25
9 Series B 21
10 Series C 14
## ---- Funding-amount plots ----
## AmountUSD is a factor of comma-grouped strings (e.g. "20,00,00,000").
## boxplot() and xtabs() would work on the factor's integer level codes rather
## than on money, so the amounts are converted to numbers first.
amount_raw <- trimws(
  gsub(",", "", as.character(startup2$AmountUSD), fixed = TRUE)
)
is_plain_number <- grepl("^[0-9]+(\\.[0-9]+)?$", amount_raw)
amount_usd <- rep(NA_real_, length(amount_raw))
amount_usd[is_plain_number] <- as.numeric(amount_raw[is_plain_number])

## One tidy frame for all plots, so every formula reads from the same rows.
## Records whose amount is not a plain number are left out.
funding <- data.frame(
  amount = amount_usd,
  year = startup2$year,
  sector = startup2$IndustryVertical,
  type = startup2$InvestmentType
)
funding <- funding[!is.na(funding$amount), , drop = FALSE]

## Overall distribution of the amount invested
boxplot(
  funding$amount,
  horizontal = TRUE,
  xlab = "Amount in USD",
  main = "Startup Investment plot"
)

## Amount by investment type (groups on the x axis, amount on the y axis)
boxplot(
  amount ~ type,
  data = funding,
  horizontal = FALSE,
  xlab = "Investment Type",
  ylab = "Amount of Investment in USD",
  main = "Investment Type"
)

## Amount by industry sector.
## NOTE(review): this plot previously drew InvestmentType against Investors
## under the title "Amount of Investment"; amount by sector is what the title
## and the "Industry Sector" label suggest -- confirm this is the intent.
boxplot(
  amount ~ sector,
  data = funding,
  horizontal = TRUE,
  xlab = "Amount in USD",
  ylab = "Industry Sector",
  main = "Amount of Investment",
  boxwex = 0.25
)

## Total amount per year (sums the amount, grouped by year), followed by the
## per-year distribution of individual amounts.
mytable <- xtabs(amount ~ year, data = funding)
boxplot(
  amount ~ year,
  data = funding,
  xlab = "Year",
  ylab = "Amount in USD",
  main = "Year to Amount"
)

## Total amount invested in each industry sector
x <- xtabs(amount ~ sector, data = funding)
plot(x, ylab = "Amount of Investment", xlab = "IndustryVertical")
Text and figures are licensed under Creative Commons Attribution CC BY-NC 4.0. The figures that have been reused from other sources don't fall under this license and can be recognized by a note in their caption: "Figure from ...".
For attribution, please cite this work as:
Gundeti (2022, May 19). Data Analytics and Computational Social Science: 601_blogpost5. Retrieved from https://github.com/DACSS/dacss_course_website/posts/httpsrpubscomrahulgdacss601hw5/
BibTeX citation
@misc{gundeti2022601_blogpost5, author = {Gundeti, Rahul}, title = {Data Analytics and Computational Social Science: 601_blogpost5}, url = {https://github.com/DACSS/dacss_course_website/posts/httpsrpubscomrahulgdacss601hw5/}, year = {2022} }