Startup

Blog post 6 with final analysis results as a part of the course “Data Science Fundamentals”

Rahul Gundeti (Graduate student, Data Analytics & Computational Social Sciences (DACSS), UMass Amherst.)
2022-05-12
# Read the raw funding CSV; text columns are loaded as factors.
input <- read.csv(
  "C:/Users/gunde/Downloads/Indian_Startup_Funding.csv",
  stringsAsFactors = TRUE
)
# Keep the import in `input` and do all further work on `startup`.
startup <- data.frame(input)
# Column-wise overview: name, type and first values of every column.
glimpse(startup)
Rows: 3,044
Columns: 8
$ Date             <fct> 09/01/2020, 13/01/2020, 09/01/2020, 02/01/2~
$ StartupName      <fct> BYJUS, Shuttl, Mamaearth, WealthBucket, Fas~
$ IndustryVertical <fct> "E-Tech", "Transportation", "E-commerce", "~
$ SubVertical      <fct> "E-learning", "App based shuttle service", ~
$ CityLocation     <fct> "Bangalore", "Gurgaon", "Bangalore", "New D~
$ Investors        <fct> "Tiger Global Management", "Susquehanna Gro~
$ InvestmentType   <fct> Private Equity Round, Series C, Series B, P~
$ AmountUSD        <fct> "20,00,00,000", "80,48,394", "1,83,58,860",~
# Count the missing values (NA) in each column of the raw data
colSums(is.na(startup))
            Date      StartupName IndustryVertical      SubVertical 
               0                0                0                0 
    CityLocation        Investors   InvestmentType        AmountUSD 
               0                0                0                0 
# Confirm the class of the working object
class(startup)
[1] "data.frame"
# Preview the first rows of the raw data
head(startup)
        Date  StartupName    IndustryVertical
1 09/01/2020        BYJUS              E-Tech
2 13/01/2020       Shuttl      Transportation
3 09/01/2020    Mamaearth          E-commerce
4 02/01/2020 WealthBucket             FinTech
5 02/01/2020       Fashor Fashion and Apparel
6 13/01/2020        Pando           Logistics
                               SubVertical CityLocation
1                               E-learning    Bangalore
2                App based shuttle service      Gurgaon
3    Retailer of baby and toddler products    Bangalore
4                        Online Investment    New Delhi
5              Embroiled Clothes For Women       Mumbai
6 Open-market, freight management platform      Chennai
                  Investors       InvestmentType    AmountUSD
1   Tiger Global Management Private Equity Round 20,00,00,000
2 Susquehanna Growth Equity             Series C    80,48,394
3     Sequoia Capital India             Series B  1,83,58,860
4            Vinod Khatumal         Pre-series A    30,00,000
5   Sprout Venture Partners           Seed Round    18,00,000
6         Chiratae Ventures             Series A    90,00,000
# Number of rows and columns
dim(startup)
[1] 3044    8
# Column overview again (same call as right after loading), taken before the
# AmountUSD and Date columns are converted below
glimpse(startup)
Rows: 3,044
Columns: 8
$ Date             <fct> 09/01/2020, 13/01/2020, 09/01/2020, 02/01/2~
$ StartupName      <fct> BYJUS, Shuttl, Mamaearth, WealthBucket, Fas~
$ IndustryVertical <fct> "E-Tech", "Transportation", "E-commerce", "~
$ SubVertical      <fct> "E-learning", "App based shuttle service", ~
$ CityLocation     <fct> "Bangalore", "Gurgaon", "Bangalore", "New D~
$ Investors        <fct> "Tiger Global Management", "Susquehanna Gro~
$ InvestmentType   <fct> Private Equity Round, Series C, Series B, P~
$ AmountUSD        <fct> "20,00,00,000", "80,48,394", "1,83,58,860",~
# AmountUSD arrives as text with comma separators (e.g. "20,00,00,000");
# strip the commas and store the column as numeric.
startup$AmountUSD <- as.numeric(
  gsub(",", "", as.character(startup$AmountUSD), fixed = TRUE)
)
# Re-count missing values per column after the conversion
colSums(is.na(startup))
            Date      StartupName IndustryVertical      SubVertical 
               0                0                0                0 
    CityLocation        Investors   InvestmentType        AmountUSD 
               0                0                0                0 
## Convert Date variable to Date format
## The source strings are day/month/year (e.g. "09/01/2020"). The pattern is
## passed through the named `format` argument; wrapping it in format() only
## worked because the result happened to land in that position.
startup$Date <- as.Date(startup$Date, format = "%d/%m/%Y")
## Confirm
str(startup)
'data.frame':   3044 obs. of  8 variables:
 $ Date            : Date, format: "2020-01-09" ...
 $ StartupName     : Factor w/ 2453 levels "#Fame","121Policy",..: 276 1911 1308 2307 656 1579 2441 568 295 495 ...
 $ IndustryVertical: Factor w/ 878 levels "360-degree view creating platform",..: 200 827 190 275 254 440 358 822 191 7 ...
 $ SubVertical     : Factor w/ 1943 levels "\"Women\\\\'s Fashion Clothing Online Platform\"",..: 461 75 1647 1279 515 1487 1216 28 112 1681 ...
 $ CityLocation    : Factor w/ 100 levels "Agra","Ahemadabad",..: 5 29 5 62 55 19 29 74 29 5 ...
 $ Investors       : Factor w/ 2405 levels " Sandeep Aggarwal, Teruhide Sato",..: 2114 2061 1857 2316 1997 468 218 1827 1523 1365 ...
 $ InvestmentType  : Factor w/ 55 levels "Angel","Angel / Seed Funding",..: 26 43 41 21 36 40 26 40 44 30 ...
 $ AmountUSD       : num  2.00e+08 8.05e+06 1.84e+07 3.00e+06 1.80e+06 ...
## Derive the funding year (numeric) from the parsed Date column
startup[["year"]] <- as.numeric(format(startup[["Date"]], format = "%Y"))
## Number of funding records per year
yeartable <- table(startup[["year"]])
yeartable

2015 2016 2017 2018 2019 2020 
 931  993  687  309  111    7 
## Visualization using a pie chart: one slice per funding year
library(plotrix)
# Label each slice with its year and its count; the counts alone do not say
# which year a slice belongs to.
pie(yeartable,
    labels = paste0(names(yeartable), ": ", yeartable),
    edges = 10,
    main = "No. of startups funded each year")

The pie chart shows how many startups were funded in each year from 2015 to 2020.

# The five startup names that appear most often in the data
head(sort(table(startup$StartupName), decreasing=TRUE),5)

Ola Cabs   Swiggy    BYJUS    Paytm  Medinfi 
       8        8        7        7        6 
# The last five entries of the same descending frequency table,
# i.e. names from the least frequent end
tail(sort(table(startup$StartupName), decreasing=TRUE),5)

ZuperMeal   Zuppler     Zuver    Zwayam   Zzungry 
        1         1         1         1         1 
# Data for the donut chart: four industry categories with hard-coded counts
i3 <- data.frame(
  category = c("Consumer Internet", "Technology", "E-Commerce", "Healthcare"),
  count = c(942, 478, 287, 72)
)
# Share of each category and the cumulative band it occupies on the ring
i3$fraction <- i3$count / sum(i3$count)
i3$ymax <- cumsum(i3$fraction)
i3$ymin <- c(0, i3$ymax[-nrow(i3)])
# Labels sit in the middle of each band
i3$labelPosition <- (i3$ymin + i3$ymax) / 2
i3$label <- paste0(i3$category, "\n value: ", i3$count)

# Draw the bands as rectangles, then bend the y axis into a circle;
# the x limits leave the hole in the middle of the donut
ggplot(
  i3,
  aes(ymax = ymax, ymin = ymin, xmax = 4, xmin = 3, fill = category)
) +
  geom_rect() +
  geom_label(x = 3, aes(y = labelPosition, label = label), size = 4) +
  scale_fill_brewer(palette = 4) +
  coord_polar(theta = "y") +
  xlim(c(1, 5)) +
  theme_void()

## Industry vertical frequency table: the last 25 entries of the
## descending-frequency ordering
industrytable_tail <- tail(
  sort(table(startup$IndustryVertical), decreasing = TRUE),
  n = 25
)
industrytable_tail

                                             Video Games 
                                                       1 
                             Video Intelligence Platform 
                                                       1 
                                         Video Streaming 
                                                       1 
                              Viral Content web Platform 
                                                       1 
                                                 Virtal  
                                                       1 
                         Virtual Health consultation app 
                                                       1 
        Virtual Reality activity based learning platform 
                                                       1 
                         Virtual Reality Headset creator 
                                                       1 
virtual reality, 3d simulation and stereoscopic products 
                                                       1 
                                Visual Blogging platform 
                                                       1 
                    visual search and discovery platform 
                                                       1 
                   Voice Call incentivization mobile app 
                                                       1 
                Warranty Programs Service Administration 
                                                       1 
                              Waste Management Solutions 
                                                       1 
                              Wealth Management Platform 
                                                       1 
                                  Web Content Publishing 
                                                       1 
                               Wedding Planning Platform 
                                                       1 
                    Wedding Venues & Vendors Marketplace 
                                                       1 
                               Weight Management Service 
                                                       1 
                 WiFi first Cloud communication platform 
                                                       1 
                                                    wine 
                                                       1 
                    Women Ethnic Wear Online Marketplace 
                                                       1 
                             Women Lifestyle Marketplace 
                                                       1 
                              Womens Fashion Wear Portal 
                                                       1 
                           Workforce Management Software 
                                                       1 
## Visualization using a horizontal bar chart of ten industry verticals
## (counts are hard-coded; presumably copied from the IndustryVertical
## frequency table -- confirm if the data changes)
set.seed(5642)  # kept from the original; nothing in this chunk is random
sample_data <- data.frame(
  name = c("Consumer Internet", "Technology", "E-Commerce", "Healthcare",
           "Finance", "Logistics", "Saas", "Education", "Food & Beverage",
           "FinTech"),
  value = c(942, 478, 287, 72, 63, 32, 28, 25, 23, 19)
)

# Create bar plot with labels.
# The plot object is not called `plot`, so base::plot() is not masked, and
# the stray empty argument after `nudge_y` has been removed.
industry_plot <- ggplot(sample_data, aes(name, value)) +
  geom_col() +
  theme_minimal() +
  geom_text(aes(label = signif(value)), nudge_y = 3)
industry_plot +
  coord_flip()

This diagram clearly shows that most startups operate in the consumer internet sector, followed by technology, e-commerce, finance, and transportation.

If we compare the funding raised, however, the story is altogether different:

E-commerce startups are about one-third as numerous as consumer internet startups, but on average they raised almost four times as much funding.

The transportation Industry clearly stands out. This clearly explains the supply chain issues.

# Keep only the rows that have no missing value in any column
Cleanstartup <- startup[complete.cases(startup), , drop = FALSE]
head(Cleanstartup)
        Date  StartupName    IndustryVertical
1 2020-01-09        BYJUS              E-Tech
2 2020-01-13       Shuttl      Transportation
3 2020-01-09    Mamaearth          E-commerce
4 2020-01-02 WealthBucket             FinTech
5 2020-01-02       Fashor Fashion and Apparel
6 2020-01-13        Pando           Logistics
                               SubVertical CityLocation
1                               E-learning    Bangalore
2                App based shuttle service      Gurgaon
3    Retailer of baby and toddler products    Bangalore
4                        Online Investment    New Delhi
5              Embroiled Clothes For Women       Mumbai
6 Open-market, freight management platform      Chennai
                  Investors       InvestmentType AmountUSD year
1   Tiger Global Management Private Equity Round 200000000 2020
2 Susquehanna Growth Equity             Series C   8048394 2020
3     Sequoia Capital India             Series B  18358860 2020
4            Vinod Khatumal         Pre-series A   3000000 2020
5   Sprout Venture Partners           Seed Round   1800000 2020
6         Chiratae Ventures             Series A   9000000 2020
# Rows and columns left after dropping incomplete rows
dim(Cleanstartup)
[1] 3038    9
# Per-column summary: ranges for Date/numeric columns, top levels for factors
summary(Cleanstartup)
      Date              StartupName            IndustryVertical
 Min.   :2015-01-02   Ola Cabs:   8   Consumer Internet: 941   
 1st Qu.:2015-11-03   Swiggy  :   8   Technology       : 478   
 Median :2016-07-15   BYJUS   :   7   E-Commerce       : 287   
 Mean   :2016-09-22   Paytm   :   7   Healthcare       :  72   
 3rd Qu.:2017-06-12   Medinfi :   6   Finance          :  63   
 Max.   :2020-01-13   Meesho  :   6   Logistics        :  32   
                      (Other) :2996   (Other)          :1165   
                    SubVertical      CityLocation
 nan                      : 931   Bangalore:863  
 Online Lending Platform  :  11   Mumbai   :609  
 Online Pharmacy          :  10   New Delhi:424  
 Food Delivery Platform   :   8   Gurgaon  :344  
 Education                :   5   Hyderabad:164  
 Online Education Platform:   5   Chennai  :142  
 (Other)                  :2068   (Other)  :492  
                 Investors                 InvestmentType
 Undisclosed Investors: 104   Private Equity      :1355  
 Ratan Tata           :  25   Seed Funding        :1350  
 Indian Angel Network :  24   Seed/ Angel Funding :  60  
 Shell Foundation     :  21   Seed / Angel Funding:  47  
 Kalaari Capital      :  16   Seed\\\\nFunding    :  30  
 Sequoia Capital      :  15   Debt Funding        :  25  
 (Other)              :2833   (Other)             : 171  
   AmountUSD             year     
 Min.   :1.00e+04   Min.   :2015  
 1st Qu.:1.00e+05   1st Qu.:2015  
 Median :5.00e+05   Median :2016  
 Mean   :1.26e+07   Mean   :2016  
 3rd Qu.:4.00e+06   3rd Qu.:2017  
 Max.   :3.90e+09   Max.   :2020  
                                  
# Work on a copy so the city recoding below leaves Cleanstartup untouched
Cleanstartup1 <- Cleanstartup
str(Cleanstartup1)
'data.frame':   3038 obs. of  9 variables:
 $ Date            : Date, format: "2020-01-09" ...
 $ StartupName     : Factor w/ 2453 levels "#Fame","121Policy",..: 276 1911 1308 2307 656 1579 2441 568 295 495 ...
 $ IndustryVertical: Factor w/ 878 levels "360-degree view creating platform",..: 200 827 190 275 254 440 358 822 191 7 ...
 $ SubVertical     : Factor w/ 1943 levels "\"Women\\\\'s Fashion Clothing Online Platform\"",..: 461 75 1647 1279 515 1487 1216 28 112 1681 ...
 $ CityLocation    : Factor w/ 100 levels "Agra","Ahemadabad",..: 5 29 5 62 55 19 29 74 29 5 ...
 $ Investors       : Factor w/ 2405 levels " Sandeep Aggarwal, Teruhide Sato",..: 2114 2061 1857 2316 1997 468 218 1827 1523 1365 ...
 $ InvestmentType  : Factor w/ 55 levels "Angel","Angel / Seed Funding",..: 26 43 41 21 36 40 26 40 44 30 ...
 $ AmountUSD       : num  2.00e+08 8.05e+06 1.84e+07 3.00e+06 1.80e+06 ...
 $ year            : num  2020 2020 2020 2020 2020 ...
# Recode as plain text so a new "Others" value can be assigned
Cleanstartup1$CityLocation <- as.character(Cleanstartup1$CityLocation)
# Six cities are kept by name; every other location is pooled as "Others"
Cleanstartup1$CityLocation[
  !(Cleanstartup1$CityLocation %in%
      c("Bangalore", "Mumbai", "New Delhi", "Gurgaon", "Pune", "Hyderabad"))
] <- "Others"


# Preview the data after the city recoding
head(Cleanstartup1)
        Date  StartupName    IndustryVertical
1 2020-01-09        BYJUS              E-Tech
2 2020-01-13       Shuttl      Transportation
3 2020-01-09    Mamaearth          E-commerce
4 2020-01-02 WealthBucket             FinTech
5 2020-01-02       Fashor Fashion and Apparel
6 2020-01-13        Pando           Logistics
                               SubVertical CityLocation
1                               E-learning    Bangalore
2                App based shuttle service      Gurgaon
3    Retailer of baby and toddler products    Bangalore
4                        Online Investment    New Delhi
5              Embroiled Clothes For Women       Mumbai
6 Open-market, freight management platform       Others
                  Investors       InvestmentType AmountUSD year
1   Tiger Global Management Private Equity Round 200000000 2020
2 Susquehanna Growth Equity             Series C   8048394 2020
3     Sequoia Capital India             Series B  18358860 2020
4            Vinod Khatumal         Pre-series A   3000000 2020
5   Sprout Venture Partners           Seed Round   1800000 2020
6         Chiratae Ventures             Series A   9000000 2020
# Record count per city group after pooling the remaining cities
table(Cleanstartup1$CityLocation)

Bangalore   Gurgaon Hyderabad    Mumbai New Delhi    Others      Pune 
      863       344       164       609       424       528       106 
startup2 <- Cleanstartup1
# Recode as plain text so a new "OtherSectors" value can be assigned
startup2$IndustryVertical <- as.character(startup2$IndustryVertical)
# Keep six sectors by name and pool everything else as "OtherSectors".
# The data spells the level "E-Commerce"; the previous "ECommerce" never
# matched, so every e-commerce record was silently pooled into
# "OtherSectors".
top_sectors <- c("Consumer Internet", "Technology", "E-Commerce",
                 "Logistics", "Education", "Healthcare")
startup2$IndustryVertical[!(startup2$IndustryVertical %in% top_sectors)] <-
  "OtherSectors"

table(startup2$IndustryVertical)

Consumer Internet         Education        Healthcare 
              941                25                72 
        Logistics      OtherSectors        Technology 
               32              1490               478 
# Frequency of each investment type as a data frame (columns Var1, Freq)
TopIT <- table(startup2$InvestmentType)
TopIT.df <- as.data.frame(TopIT)
# Keep the ten most frequent types. The ranking column is named explicitly
# instead of relying on top_n() falling back to the last column.
top10IT <- top_n(TopIT.df, 10, wt = Freq)
top10IT
                   Var1 Freq
1          Debt Funding   25
2        Private Equity 1355
3  Seed / Angel Funding   47
4          Seed Funding 1350
5   Seed/ Angel Funding   60
6    Seed/Angel Funding   23
7      Seed\\\\nFunding   30
8              Series A   25
9              Series B   21
10             Series C   14
# Total investment amount across all cleaned records
sum(startup2$AmountUSD)
[1] 38277567120
# Distribution of all investment amounts as a single horizontal box plot
boxplot(
  startup2$AmountUSD,
  horizontal = TRUE,
  main = "Startup Investment plot",
  xlab = "Amount in USD"
)

# Restrict to the city found in row 3 of startup2 (Bangalore in the data
# shown above)
temp <- startup2[complete.cases(startup2), ] %>%
  filter(CityLocation == startup2$CityLocation[3])

# Looking for outliers in the data and removing them
outliers <- boxplot(temp$AmountUSD ~ temp$IndustryVertical, plot = FALSE)$out
# Drop rows with a logical mask: temp[-which(...), ] would return zero rows
# whenever no outlier is found, because -integer(0) selects nothing.
temp.out <- temp[!(temp$AmountUSD %in% outliers), ]

# Box plot of the log amount per industry vertical; the dot marks the mean
p2 <- ggplot(
  temp.out,
  aes(x = reorder(IndustryVertical, AmountUSD), y = log(AmountUSD))
) +
  geom_boxplot(outlier.shape = NA, show.legend = FALSE) +
  coord_flip() +
  stat_summary(fun = mean, col = "honeydew4", geom = "point") +
  labs(
    x = "Industry Vertical",
    # The axis shows the natural log of the amount, not millions
    y = "log(Amount in USD)",
    title = "Investment amount per Industry Vertical (outliers removed)",
    subtitle = "average given by dot"
  )
p2

# Investment amount per funding year. The title and the axis label both say
# "Year", but the groups were investment types (already plotted further
# below) and the two axis labels were swapped.
# NOTE(review): grouping by `year` is inferred from the labels -- confirm.
boxplot(AmountUSD ~ year, data = startup2, horizontal = FALSE,
        xlab = "Year", ylab = "Amount of Investment in USD",
        main = "Year Wise Investment Analysis")

# Investment amount per industry sector. The previous call used the factor
# SubVertical as the response (plotting its integer codes) and grouped by
# city, although the title and labels describe amount by industry sector.
# With horizontal = TRUE the values run along the x axis, so the amount
# label belongs on xlab.
boxplot(AmountUSD ~ IndustryVertical, data = startup2, horizontal = TRUE,
        xlab = "Amount of Investment in USD", ylab = "Industry Sector",
        main = "Industrial Sector Wise Investment Analysis", boxwex = 0.5)

# Total investment per industry sector. The response was `year`, which
# summed calendar years rather than the amounts the axis labels describe.
mytable <- xtabs(AmountUSD ~ IndustryVertical, data = startup2)
# Spread of the per-sector totals (a single box); call the generic rather
# than the boxplot.default method directly.
boxplot(mytable, ylab = "Amount of Investment", xlab = "Industry Sector")

# Investment amount per city. The response was the factor IndustryVertical;
# with horizontal = TRUE the amounts run along the x axis.
boxplot(AmountUSD ~ CityLocation, data = startup, horizontal = TRUE,
        xlab = "Amount of Investment in USD", ylab = "City",
        main = "City Wise Investment Analysis", boxwex = 3)

# Investment amount per investment type
boxplot(AmountUSD ~ InvestmentType, data = startup, horizontal = FALSE,
        ylab = "Amount of Investment in USD", xlab = "Investment Type",
        main = "Investment type vs Investment Analysis")

# Per-sector totals; the x axis holds industry sectors, not investment types
plot(mytable, ylab = "Amount of Investment", xlab = "Industry Sector")

# Number of records in the 20 most frequent cities; compute the table once
city_counts <- head(sort(table(startup$CityLocation), decreasing = TRUE), 20)
# Size the y axis from the data: the fixed limit of 750 was lower than the
# tallest bar, and the extra 20% leaves room for the rotated count labels.
b <- barplot(city_counts, col = rainbow(10, 0.5), las = 2,
             ylim = c(0, max(city_counts) * 1.2),
             xlab = "City Name", ylab = "No Of StartUps")
# Write each count next to the top of its bar
text(b, city_counts, city_counts, srt = 90, pos = 4)

## Investor frequency table: the six most frequent investor entries
Investortable <- head(
  sort(table(startup2$Investors), decreasing = TRUE),
  n = 6L
)
Investortable

Undisclosed Investors            Ratan Tata  Indian Angel Network 
                  104                    25                    24 
     Shell Foundation       Kalaari Capital       Sequoia Capital 
                   21                    16                    15 
## Bar chart of the most frequent investor entries
barplot(
  Investortable,
  col = "lightblue",
  cex.names = 0.5,
  xlab = "Investor Name",
  ylab = "No. of Startups which received funding"
)

# Frequency of the 20 most common investment types; compute the table once
type_counts <- head(sort(table(startup2$InvestmentType), decreasing = TRUE), 20)
b <- barplot(type_counts, col = rainbow(10, 0.5), las = 2,
             ylim = c(0, 1500),
             xlab = "InvestmentType", ylab = "Frequency of Investments")
# Label each bar with its own count. The labels previously came from the
# CityLocation table, so the numbers did not belong to the bars.
text(b, type_counts, type_counts, srt = 90, pos = 4)

# Lower-case the sector names so spellings that differ only in case
# collapse into one level
startup2$IndustryVertical <- factor(tolower(startup2$IndustryVertical))

# Total amount funded per industry vertical (computed on `startup`)
fund5 <- aggregate(AmountUSD ~ IndustryVertical, data = startup, FUN = sum)
fund5 <- as.data.frame(fund5)
# Largest totals first, then keep the top five verticals
fund5 <- fund5[order(fund5$AmountUSD, decreasing = TRUE), ]
fund5 <- head(drop_na(fund5, IndustryVertical), 5)
fund5
   IndustryVertical  AmountUSD
1        E-Commerce 8189518685
2 Consumer Internet 6323472695
3    Transportation 3916632394
4        Technology 2267804310
5           Finance 1981978000
# Horizontal bar chart of the five largest funding totals
ggplot(data = fund5, aes(IndustryVertical, AmountUSD, fill = IndustryVertical)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  # Show the amounts in millions: 1 M = 1e6, so the factor is 1e-6. The
  # previous 0.0000001 (1e-7) printed values ten times too small.
  scale_y_continuous(labels = unit_format(unit = "M", scale = 1e-6)) +
  labs(x = "Industry Category", y = "Investment Amount (USD)", title = "TOP 5 Start-up Investment in India") +
  theme(legend.position = "none")

# Investment amount (log10 x axis) against year, coloured by sector,
# with one panel per city group
ggplot(
  data = startup2,
  mapping = aes(x = AmountUSD, y = year, color = IndustryVertical)
) +
  geom_point() +
  scale_x_log10() +
  facet_wrap(~CityLocation)

The graph shows the distribution of startup growth across various cities over the years 2015 to 2020.

# Total funding per startup name, largest total first
temp <- startup %>%
  select(StartupName, Investors, AmountUSD) %>%
  ddply(.(StartupName), summarise, sum = sum(AmountUSD)) %>%
  arrange(desc(sum))
# Render the 25 largest totals as a striped HTML table with highlighted rows
kable(head(temp, 25), "html") %>%
  kable_styling("striped", full_width = TRUE) %>%
  column_spec(1:2, bold = TRUE, background = "white") %>%
  row_spec(c(1, 2, 3, 5, 6, 24, 25), bold = FALSE, color = "lightblue", background = "Green")
StartupName sum
Flipkart 4059700000
Rapido Bike Taxi 3900000000
Paytm 3149050000
Ola 984500000
Udaan 870000000
Flipkart.com 700000000
Snapdeal 700000000
Ola Cabs 669725000
True North 600000000
BYJUS 525240000
BigBasket 507000000
GOQii 450331046
Zomato 435000000
Olacabs 400000000
Oyo Rooms 350000000
Automation Anywhere 300000000
Grofers 297000000
OYO Rooms 285000000
Vogo Automotive 283000000
Swiggy 270500000
Edelweiss 270000000
PolicyBazaar 267700000
Zilingo 235900000
Lenskart.com 231000000
Quikr 230000000

Reuse

Text and figures are licensed under Creative Commons Attribution CC BY-NC 4.0. The figures that have been reused from other sources don't fall under this license and can be recognized by a note in their caption: "Figure from ...".

Citation

For attribution, please cite this work as

Gundeti (2022, May 19). Data Analytics and Computational Social Science: Startup . Retrieved from https://github.com/DACSS/dacss_course_website/posts/httpsrpubscomrahulgdacss601hw6/

BibTeX citation

@misc{gundeti2022startup,
  author = {Gundeti, Rahul},
  title = {Data Analytics and Computational Social Science: Startup },
  url = {https://github.com/DACSS/dacss_course_website/posts/httpsrpubscomrahulgdacss601hw6/},
  year = {2022}
}