Blog post 6 with final analysis results as a part of the course “Data Science Fundamentals”
# Load the raw funding records; text columns are read in as factors.
# NOTE(review): the path is an absolute, machine-specific location.
input <- read.csv(
  file = "C:/Users/gunde/Downloads/Indian_Startup_Funding.csv",
  stringsAsFactors = TRUE
)
# Working copy used by the rest of the analysis.
startup <- data.frame(input)
# Column names, types and the first few values.
glimpse(startup)
Rows: 3,044
Columns: 8
$ Date <fct> 09/01/2020, 13/01/2020, 09/01/2020, 02/01/2~
$ StartupName <fct> BYJUS, Shuttl, Mamaearth, WealthBucket, Fas~
$ IndustryVertical <fct> "E-Tech", "Transportation", "E-commerce", "~
$ SubVertical <fct> "E-learning", "App based shuttle service", ~
$ CityLocation <fct> "Bangalore", "Gurgaon", "Bangalore", "New D~
$ Investors <fct> "Tiger Global Management", "Susquehanna Gro~
$ InvestmentType <fct> Private Equity Round, Series C, Series B, P~
$ AmountUSD <fct> "20,00,00,000", "80,48,394", "1,83,58,860",~
Date StartupName IndustryVertical SubVertical
0 0 0 0
CityLocation Investors InvestmentType AmountUSD
0 0 0 0
# Check the class of the loaded object.
class(startup)
[1] "data.frame"
# Preview the first six records.
head(startup)
Date StartupName IndustryVertical
1 09/01/2020 BYJUS E-Tech
2 13/01/2020 Shuttl Transportation
3 09/01/2020 Mamaearth E-commerce
4 02/01/2020 WealthBucket FinTech
5 02/01/2020 Fashor Fashion and Apparel
6 13/01/2020 Pando Logistics
SubVertical CityLocation
1 E-learning Bangalore
2 App based shuttle service Gurgaon
3 Retailer of baby and toddler products Bangalore
4 Online Investment New Delhi
5 Embroiled Clothes For Women Mumbai
6 Open-market, freight management platform Chennai
Investors InvestmentType AmountUSD
1 Tiger Global Management Private Equity Round 20,00,00,000
2 Susquehanna Growth Equity Series C 80,48,394
3 Sequoia Capital India Series B 1,83,58,860
4 Vinod Khatumal Pre-series A 30,00,000
5 Sprout Venture Partners Seed Round 18,00,000
6 Chiratae Ventures Series A 90,00,000
# Number of rows and columns.
dim(startup)
[1] 3044 8
# Column types before any conversion (every column is still a factor).
glimpse(startup)
Rows: 3,044
Columns: 8
$ Date <fct> 09/01/2020, 13/01/2020, 09/01/2020, 02/01/2~
$ StartupName <fct> BYJUS, Shuttl, Mamaearth, WealthBucket, Fas~
$ IndustryVertical <fct> "E-Tech", "Transportation", "E-commerce", "~
$ SubVertical <fct> "E-learning", "App based shuttle service", ~
$ CityLocation <fct> "Bangalore", "Gurgaon", "Bangalore", "New D~
$ Investors <fct> "Tiger Global Management", "Susquehanna Gro~
$ InvestmentType <fct> Private Equity Round, Series C, Series B, P~
$ AmountUSD <fct> "20,00,00,000", "80,48,394", "1,83,58,860",~
# Strip the thousands separators (e.g. "20,00,00,000") and store the amount as a number.
startup[["AmountUSD"]] <- as.numeric(
  gsub(pattern = ",", replacement = "", x = startup[["AmountUSD"]])
)
Date StartupName IndustryVertical SubVertical
0 0 0 0
CityLocation Investors InvestmentType AmountUSD
0 0 0 0
## Convert Date variable to Date format
# The dates are written day/month/year. The pattern is passed through the
# `format` argument by name; the previous `format("%d/%m/%Y")` call only
# worked because its result happened to land in that position.
startup$Date <- as.Date(startup$Date, format = "%d/%m/%Y")
## Confirm
str(startup)
'data.frame': 3044 obs. of 8 variables:
$ Date : Date, format: "2020-01-09" ...
$ StartupName : Factor w/ 2453 levels "#Fame","121Policy",..: 276 1911 1308 2307 656 1579 2441 568 295 495 ...
$ IndustryVertical: Factor w/ 878 levels "360-degree view creating platform",..: 200 827 190 275 254 440 358 822 191 7 ...
$ SubVertical : Factor w/ 1943 levels "\"Women\\\\'s Fashion Clothing Online Platform\"",..: 461 75 1647 1279 515 1487 1216 28 112 1681 ...
$ CityLocation : Factor w/ 100 levels "Agra","Ahemadabad",..: 5 29 5 62 55 19 29 74 29 5 ...
$ Investors : Factor w/ 2405 levels " Sandeep Aggarwal, Teruhide Sato",..: 2114 2061 1857 2316 1997 468 218 1827 1523 1365 ...
$ InvestmentType : Factor w/ 55 levels "Angel","Angel / Seed Funding",..: 26 43 41 21 36 40 26 40 44 30 ...
$ AmountUSD : num 2.00e+08 8.05e+06 1.84e+07 3.00e+06 1.80e+06 ...
## Make a new column for year
# Extract the four-digit year from each Date and keep it as a number.
funding_year_text <- format(startup[["Date"]], format = "%Y")
startup[["year"]] <- as.numeric(funding_year_text)
## Year frequency table
# Count of funding records per calendar year.
yeartable <- table(startup[["year"]])
yeartable
2015 2016 2017 2018 2019 2020
931 993 687 309 111 7
## Visualization using pie chart
library(plotrix)
# One slice per year, labelled with that year's number of funding records.
pie(yeartable, labels = yeartable, edges = 10, main = "No. of startups funded each year")
The pie chart shows the number of startups funded in each year from 2015 to 2020.
Ola Cabs Swiggy BYJUS Paytm Medinfi
8 8 7 7 6
ZuperMeal Zuppler Zuver Zwayam Zzungry
1 1 1 1 1
#Creating dataframe
# Four largest industry verticals and their record counts.
i3 <- data.frame(
  category = c("Consumer Internet", "Technology", "E-Commerce", "Healthcare"),
  count = c(942, 478, 287, 72)
)
#Calculating percentages
# Each ring segment spans [ymin, ymax] on a 0-1 scale; the label sits at its midpoint.
segment_share <- i3$count / sum(i3$count)
segment_end <- cumsum(segment_share)
segment_start <- c(0, segment_end[-length(segment_end)])
i3 <- cbind(
  i3,
  data.frame(
    fraction = segment_share,
    ymax = segment_end,
    ymin = segment_start,
    labelPosition = (segment_end + segment_start) / 2,
    label = paste0(i3$category, "\n value: ", i3$count),
    stringsAsFactors = FALSE
  )
)
# Make the plot
# Rectangles between x = 3 and x = 4 become a ring once the y axis is wrapped
# into polar coordinates; the x limits (1 to 5) leave the hole in the middle.
ggplot(
  data = i3,
  mapping = aes(xmin = 3, xmax = 4, ymin = ymin, ymax = ymax, fill = category)
) +
  geom_rect() +
  geom_label(mapping = aes(y = labelPosition, label = label), x = 3, size = 4) +
  scale_fill_brewer(palette = 4) +
  coord_polar(theta = "y") +
  xlim(c(1, 5)) +
  theme_void()
##Industry vertical frequency table
# Sort verticals from most to least frequent and keep the last 25 entries.
industrytable_tail <- tail(
  sort(table(startup[["IndustryVertical"]]), decreasing = TRUE),
  n = 25
)
industrytable_tail
Video Games
1
Video Intelligence Platform
1
Video Streaming
1
Viral Content web Platform
1
Virtal
1
Virtual Health consultation app
1
Virtual Reality activity based learning platform
1
Virtual Reality Headset creator
1
virtual reality, 3d simulation and stereoscopic products
1
Visual Blogging platform
1
visual search and discovery platform
1
Voice Call incentivization mobile app
1
Warranty Programs Service Administration
1
Waste Management Solutions
1
Wealth Management Platform
1
Web Content Publishing
1
Wedding Planning Platform
1
Wedding Venues & Vendors Marketplace
1
Weight Management Service
1
WiFi first Cloud communication platform
1
wine
1
Women Ethnic Wear Online Marketplace
1
Women Lifestyle Marketplace
1
Womens Fashion Wear Portal
1
Workforce Management Software
1
## Visualization using Bar chart
set.seed(5642)
# Record counts for ten industry verticals, entered by hand.
vertical_counts <- c(
  "Consumer Internet" = 942,
  "Technology" = 478,
  "E-Commerce" = 287,
  "Healthcare" = 72,
  "Finance" = 63,
  "Logistics" = 32,
  "Saas" = 28,
  "Education" = 25,
  "Food & Beverage" = 23,
  "FinTech" = 19
)
sample_data <- data.frame(
  name = names(vertical_counts),
  value = unname(vertical_counts)
)
# Create bar plot with labels
# Bars are ordered by value so the ranking of verticals can be read directly
# from the chart. labs() keeps the axis title as "name" instead of the
# reorder() expression. The stray trailing comma in geom_text() is removed.
plot <- ggplot(sample_data, aes(reorder(name, value), value)) +
  geom_bar(stat = "identity") +
  theme_minimal() +
  geom_text(aes(label = signif(value)), nudge_y = 3) +
  labs(x = "name")
# Flip the axes so the vertical names are readable.
plot +
  coord_flip()
This diagram clearly shows that most startups work in consumer internet, followed by technology, e-commerce, finance, and transportation.
If we compare the funding raised, the story is very different altogether:
e-commerce startups are about one third as numerous as consumer-internet startups, but on average they raised almost 4x as much as consumer-internet startups.
The transportation industry clearly stands out. This clearly explains the supply chain issues.
# Keep only the records with no missing value in any column.
Cleanstartup <- startup[which(complete.cases(startup)), ]
head(Cleanstartup, n = 6L)
Date StartupName IndustryVertical
1 2020-01-09 BYJUS E-Tech
2 2020-01-13 Shuttl Transportation
3 2020-01-09 Mamaearth E-commerce
4 2020-01-02 WealthBucket FinTech
5 2020-01-02 Fashor Fashion and Apparel
6 2020-01-13 Pando Logistics
SubVertical CityLocation
1 E-learning Bangalore
2 App based shuttle service Gurgaon
3 Retailer of baby and toddler products Bangalore
4 Online Investment New Delhi
5 Embroiled Clothes For Women Mumbai
6 Open-market, freight management platform Chennai
Investors InvestmentType AmountUSD year
1 Tiger Global Management Private Equity Round 200000000 2020
2 Susquehanna Growth Equity Series C 8048394 2020
3 Sequoia Capital India Series B 18358860 2020
4 Vinod Khatumal Pre-series A 3000000 2020
5 Sprout Venture Partners Seed Round 1800000 2020
6 Chiratae Ventures Series A 9000000 2020
# Rows and columns remaining after incomplete records were removed.
dim(Cleanstartup)
[1] 3038 9
# Per-column summary: ranges for dates and numbers, top level counts for factors.
summary(Cleanstartup)
Date StartupName IndustryVertical
Min. :2015-01-02 Ola Cabs: 8 Consumer Internet: 941
1st Qu.:2015-11-03 Swiggy : 8 Technology : 478
Median :2016-07-15 BYJUS : 7 E-Commerce : 287
Mean :2016-09-22 Paytm : 7 Healthcare : 72
3rd Qu.:2017-06-12 Medinfi : 6 Finance : 63
Max. :2020-01-13 Meesho : 6 Logistics : 32
(Other) :2996 (Other) :1165
SubVertical CityLocation
nan : 931 Bangalore:863
Online Lending Platform : 11 Mumbai :609
Online Pharmacy : 10 New Delhi:424
Food Delivery Platform : 8 Gurgaon :344
Education : 5 Hyderabad:164
Online Education Platform: 5 Chennai :142
(Other) :2068 (Other) :492
Investors InvestmentType
Undisclosed Investors: 104 Private Equity :1355
Ratan Tata : 25 Seed Funding :1350
Indian Angel Network : 24 Seed/ Angel Funding : 60
Shell Foundation : 21 Seed / Angel Funding: 47
Kalaari Capital : 16 Seed\\\\nFunding : 30
Sequoia Capital : 15 Debt Funding : 25
(Other) :2833 (Other) : 171
AmountUSD year
Min. :1.00e+04 Min. :2015
1st Qu.:1.00e+05 1st Qu.:2015
Median :5.00e+05 Median :2016
Mean :1.26e+07 Mean :2016
3rd Qu.:4.00e+06 3rd Qu.:2017
Max. :3.90e+09 Max. :2020
# Work on a copy so Cleanstartup itself is left unchanged by the recoding below.
Cleanstartup1 <- Cleanstartup
str(Cleanstartup1)
'data.frame': 3038 obs. of 9 variables:
$ Date : Date, format: "2020-01-09" ...
$ StartupName : Factor w/ 2453 levels "#Fame","121Policy",..: 276 1911 1308 2307 656 1579 2441 568 295 495 ...
$ IndustryVertical: Factor w/ 878 levels "360-degree view creating platform",..: 200 827 190 275 254 440 358 822 191 7 ...
$ SubVertical : Factor w/ 1943 levels "\"Women\\\\'s Fashion Clothing Online Platform\"",..: 461 75 1647 1279 515 1487 1216 28 112 1681 ...
$ CityLocation : Factor w/ 100 levels "Agra","Ahemadabad",..: 5 29 5 62 55 19 29 74 29 5 ...
$ Investors : Factor w/ 2405 levels " Sandeep Aggarwal, Teruhide Sato",..: 2114 2061 1857 2316 1997 468 218 1827 1523 1365 ...
$ InvestmentType : Factor w/ 55 levels "Angel","Angel / Seed Funding",..: 26 43 41 21 36 40 26 40 44 30 ...
$ AmountUSD : num 2.00e+08 8.05e+06 1.84e+07 3.00e+06 1.80e+06 ...
$ year : num 2020 2020 2020 2020 2020 ...
# Recode CityLocation: keep six named cities and pool every other city as "Others".
Cleanstartup1[["CityLocation"]] <- as.character(Cleanstartup1[["CityLocation"]])
# NA is listed among the values to skip so that missing cities are left as they are.
is_other_city <- !(Cleanstartup1[["CityLocation"]] %in%
  c("Bangalore", "Mumbai", "New Delhi", "Gurgaon", "Pune", "Hyderabad", NA))
Cleanstartup1[["CityLocation"]][is_other_city] <- "Others"
head(Cleanstartup1)
Date StartupName IndustryVertical
1 2020-01-09 BYJUS E-Tech
2 2020-01-13 Shuttl Transportation
3 2020-01-09 Mamaearth E-commerce
4 2020-01-02 WealthBucket FinTech
5 2020-01-02 Fashor Fashion and Apparel
6 2020-01-13 Pando Logistics
SubVertical CityLocation
1 E-learning Bangalore
2 App based shuttle service Gurgaon
3 Retailer of baby and toddler products Bangalore
4 Online Investment New Delhi
5 Embroiled Clothes For Women Mumbai
6 Open-market, freight management platform Others
Investors InvestmentType AmountUSD year
1 Tiger Global Management Private Equity Round 200000000 2020
2 Susquehanna Growth Equity Series C 8048394 2020
3 Sequoia Capital India Series B 18358860 2020
4 Vinod Khatumal Pre-series A 3000000 2020
5 Sprout Venture Partners Seed Round 1800000 2020
6 Chiratae Ventures Series A 9000000 2020
# Record counts per city after pooling the smaller cities into "Others".
table(Cleanstartup1$CityLocation)
Bangalore Gurgaon Hyderabad Mumbai New Delhi Others Pune
863 344 164 609 424 528 106
startup2 <- Cleanstartup1
startup2$IndustryVertical <- as.character(startup2$IndustryVertical)
#clean code
# Keep six headline verticals and pool every other vertical as "OtherSectors".
# The e-commerce vertical is spelled "E-Commerce" in the data; the previous
# comparison against "ECommerce" matched no rows, so every e-commerce record
# was pooled into "OtherSectors". NA is listed so missing values stay as they are.
startup2$IndustryVertical[!(startup2$IndustryVertical %in%
  c("Consumer Internet", "Technology", "E-Commerce", "Logistics", "Education", "Healthcare", NA))] <- "OtherSectors"
table(startup2$IndustryVertical)
Consumer Internet Education Healthcare
941 25 72
Logistics OtherSectors Technology
32 1490 478
# Frequency of each investment type, as a two-column data frame (Var1, Freq).
TopIT <- table(startup2$InvestmentType)
TopIT.df <- as.data.frame(TopIT)
# Rank explicitly by the Freq column instead of letting top_n() pick the
# last column implicitly. Rows keep their original order; ties are retained.
top10IT <- top_n(TopIT.df, 10, wt = Freq)
top10IT
Var1 Freq
1 Debt Funding 25
2 Private Equity 1355
3 Seed / Angel Funding 47
4 Seed Funding 1350
5 Seed/ Angel Funding 60
6 Seed/Angel Funding 23
7 Seed\\\\nFunding 30
8 Series A 25
9 Series B 21
10 Series C 14
# Total of the AmountUSD column across all remaining records.
sum(startup2$AmountUSD)
[1] 38277567120
# Single horizontal box plot of every investment amount.
boxplot(
  x = startup2[["AmountUSD"]],
  main = "Startup Investment plot",
  xlab = "Amount in USD",
  horizontal = TRUE
)
# Restrict to the records whose city equals the city of the third row of startup2.
temp <- startup2[complete.cases(startup2), ] %>%
  filter(CityLocation == startup2$CityLocation[3])
#Looking for outliers in the data and removing them
outliers <- boxplot(temp$AmountUSD ~ temp$IndustryVertical, plot = FALSE)$out
# Use a logical mask rather than -which(...): when no outliers are found,
# which() returns integer(0) and negative indexing would drop every row.
# Note that any row whose amount equals an outlier value is removed,
# regardless of which vertical flagged it.
temp.out <- temp[!(temp$AmountUSD %in% outliers), ]
# Plotting a boxplot
# The y axis is the natural log of the amount, so it is labelled as such.
p2 <- ggplot(temp.out, aes(x = reorder(IndustryVertical, AmountUSD), y = log(AmountUSD))) +
  geom_boxplot(outlier.shape = NA, show.legend = FALSE) +
  coord_flip() +
  stat_summary(fun = mean, col = "honeydew4", geom = "point") +
  labs(
    x = "Industry Vertical",
    y = "log(Amount in USD)",
    title = "Investment amount per industry vertical, outliers removed",
    subtitle = "average given by dot"
  )
p2
# Amount by investment type. The groups on the x axis are investment types and
# the y axis is the amount, so the labels and title say so (they previously
# referred to "Year").
boxplot(AmountUSD ~ InvestmentType, data = startup2, horizontal = FALSE,
        xlab = "Investment Type", ylab = "Amount of Investment in USD",
        main = "Investment Type Wise Investment Analysis")
# Amount by industry sector, matching the plot's title and labels. The previous
# formula used the factor SubVertical as the response and CityLocation as the
# grouping, neither of which is an amount or an industry sector.
# With horizontal = TRUE the amounts run along the x axis.
boxplot(AmountUSD ~ IndustryVertical, data = startup2, horizontal = TRUE,
        xlab = "Amount of Investment in USD", ylab = "Industry Sector",
        main = "Industrial Sector Wise Investment Analysis", boxwex = 0.5)
# Total amount invested per industry vertical. The previous formula summed the
# `year` column, which does not correspond to the "Amount of Investment" label.
mytable <- xtabs(AmountUSD ~ IndustryVertical, data = startup2)
# Box plot of the per-vertical totals.
boxplot.default(mytable, ylab = "Amount of Investment", xlab = "Industry Sector")
# Amount by city, matching the plot's title and labels. The previous formula
# used the factor IndustryVertical as the response instead of the amount.
# With horizontal = TRUE the amounts run along the x axis.
boxplot(AmountUSD ~ CityLocation, data = startup, horizontal = TRUE,
        xlab = "Amount of Investment in USD", ylab = "City",
        main = "Startup City Wise Investment Analysis", boxwex = 3)
# Amount by investment type for the full (uncleaned) data set.
boxplot(
  AmountUSD ~ InvestmentType,
  data = startup,
  main = "Investment type vs Investment Analysis",
  xlab = "Investment Type",
  ylab = "Amount of Investment in USD",
  horizontal = FALSE
)
# mytable is tabulated by industry vertical, so the x axis is labelled as the
# industry sector rather than the type of investment.
plot(mytable, ylab = "Amount of Investment", xlab = "Industry Sector")
# Top 20 cities by number of funding records (computed once and reused).
city_counts <- head(sort(table(startup$CityLocation), decreasing = TRUE), 20)
# The y limit is derived from the data, with headroom for the rotated count
# labels; the previous fixed limit of 750 cut off the tallest bar.
b <- barplot(city_counts, col = rainbow(10, 0.5), las = 2,
             ylim = c(0, max(city_counts) * 1.2),
             xlab = "City Name", ylab = "No Of StartUps")
# Write each bar's count next to its top.
text(b, city_counts, city_counts, srt = 90, pos = 4)
##Investor frequency table
# Six most frequent entries of the Investors column.
Investortable <- head(
  sort(table(startup2[["Investors"]]), decreasing = TRUE),
  n = 6L
)
Investortable
Undisclosed Investors Ratan Tata Indian Angel Network
104 25 24
Shell Foundation Kalaari Capital Sequoia Capital
21 16 15
## Visualization using Bar chart
# One bar per investor; names are shrunk so the long labels fit.
barplot(
  height = Investortable,
  col = "lightblue",
  cex.names = 0.5,
  xlab = "Investor Name",
  ylab = "No. of Startups which received funding"
)
# Top 20 investment types by number of funding records (computed once and reused).
type_counts <- head(sort(table(startup2$InvestmentType), decreasing = TRUE), 20)
b <- barplot(type_counts, col = rainbow(10, 0.5), las = 2, ylim = c(0, 1500),
             xlab = "InvestmentType", ylab = "Frequency of Investments")
# Label each bar with its own count. The labels were previously taken from the
# CityLocation table, so the numbers shown did not belong to the bars.
text(b, type_counts, type_counts, srt = 90, pos = 4)
# Lower-case every industry label so that spellings differing only in case
# collapse into one factor level.
startup2[["IndustryVertical"]] <- factor(tolower(startup2[["IndustryVertical"]]))
#Total amount funded
# Sum AmountUSD per industry vertical over the full data set.
fund5 <- aggregate(AmountUSD ~ IndustryVertical, data = startup, FUN = sum)
# Largest totals first.
fund5 <- fund5[order(fund5$AmountUSD, decreasing = TRUE), ]
# Drop rows without a vertical and keep the five largest.
fund5 <- head(drop_na(fund5, IndustryVertical), n = 5)
fund5
IndustryVertical AmountUSD
1 E-Commerce 8189518685
2 Consumer Internet 6323472695
3 Transportation 3916632394
4 Technology 2267804310
5 Finance 1981978000
# Horizontal bar chart of the five verticals with the largest total funding.
ggplot(data = fund5, aes(IndustryVertical, AmountUSD, fill = IndustryVertical)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  # Axis labels are in millions of USD: 1e-6 converts dollars to millions.
  # The previous scale of 1e-7 printed values ten times too small.
  scale_y_continuous(labels = unit_format(unit = "M", scale = 1e-6)) +
  labs(x = "Industry Category", y = "Investment Amount (USD)", title = "TOP 5 Start-up Investment in India") +
  theme(legend.position = "none")
# One panel per city: funding amount (log10 scale) against year, with points
# coloured by industry vertical.
ggplot(
  data = startup2,
  mapping = aes(x = AmountUSD, y = year, color = IndustryVertical)
) +
  geom_point() +
  scale_x_log10() +
  facet_wrap(facets = ~CityLocation)
The graph shows, for each city, the funding amounts raised by startups in each year from 2015 to 2020, coloured by industry vertical.
# Total AmountUSD per startup name, largest first.
temp <- startup %>%
  select(StartupName, Investors, AmountUSD) %>%
  ddply(.(StartupName), summarise, sum = sum(AmountUSD)) %>%
  arrange(desc(sum))
# Render the 25 largest totals as a striped HTML table; the listed rows are
# highlighted in light blue on green.
kable(head(temp, n = 25), format = "html") %>%
  kable_styling(bootstrap_options = "striped", full_width = TRUE) %>%
  column_spec(column = 1:2, bold = TRUE, background = "white") %>%
  row_spec(row = c(1, 2, 3, 5, 6, 24, 25), bold = FALSE, color = "lightblue", background = "Green")
StartupName | sum |
---|---|
Flipkart | 4059700000 |
Rapido Bike Taxi | 3900000000 |
Paytm | 3149050000 |
Ola | 984500000 |
Udaan | 870000000 |
Flipkart.com | 700000000 |
Snapdeal | 700000000 |
Ola Cabs | 669725000 |
True North | 600000000 |
BYJUS | 525240000 |
BigBasket | 507000000 |
GOQii | 450331046 |
Zomato | 435000000 |
Olacabs | 400000000 |
Oyo Rooms | 350000000 |
Automation Anywhere | 300000000 |
Grofers | 297000000 |
OYO Rooms | 285000000 |
Vogo Automotive | 283000000 |
Swiggy | 270500000 |
Edelweiss | 270000000 |
PolicyBazaar | 267700000 |
Zilingo | 235900000 |
Lenskart.com | 231000000 |
Quikr | 230000000 |
Text and figures are licensed under Creative Commons Attribution CC BY-NC 4.0. The figures that have been reused from other sources don't fall under this license and can be recognized by a note in their caption: "Figure from ...".
For attribution, please cite this work as
Gundeti (2022, May 19). Data Analytics and Computational Social Science: Startup . Retrieved from https://github.com/DACSS/dacss_course_website/posts/httpsrpubscomrahulgdacss601hw6/
BibTeX citation
@misc{gundeti2022startup, author = {Gundeti, Rahul}, title = {Data Analytics and Computational Social Science: Startup }, url = {https://github.com/DACSS/dacss_course_website/posts/httpsrpubscomrahulgdacss601hw6/}, year = {2022} }