Final Project Assignment#2 : Exploratory Analysis and Visualization: Project & Data Description

final_Project_assignment_2

final_project_exploratory_analysis

Exploratory Analysis and Visualization

Author

Pradhakshya Dhanakumar

Published

May 1, 2023

# Install the packages this report needs, but only the ones that are missing.
# An explicit CRAN mirror is supplied because a non-interactive render has no
# default mirror configured, which makes a bare install.packages() call fail.
required_packages <- c("magrittr", "plotly", "ggplot2")
missing_packages <- required_packages[
  !vapply(required_packages, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_packages) > 0) {
  install.packages(missing_packages, repos = "https://cloud.r-project.org")
}

library(magrittr)
library(plotly)

Loading required package: ggplot2


Attaching package: 'plotly'

The following object is masked from 'package:ggplot2':

    last_plot

The following object is masked from 'package:stats':

    filter

The following object is masked from 'package:graphics':

    layout

Understanding Data:

# Read the Spotify songs data set straight from the TidyTuesday repository.
# The URL is split only to keep lines short; the pieces join to the full path.
spotify_url <- paste0(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/",
  "data/2020/2020-01-21/spotify_songs.csv"
)
spotify_songs <- readr::read_csv(spotify_url, show_col_types = FALSE)

# View() opens a spreadsheet-style viewer and fails when the document is
# rendered non-interactively, so only call it in an interactive session.
if (interactive()) {
  View(spotify_songs)
}

# Number of rows and columns in the raw data.
c(nrow(spotify_songs), ncol(spotify_songs))

[1] 32833    23

# Show the last five rows of the data.
spotify_songs %>% tail(n = 5)

# Per-column summary: length/class for character columns, quartiles and mean
# for numeric columns.
spotify_songs %>% summary()

   track_id          track_name        track_artist       track_popularity
 Length:32833       Length:32833       Length:32833       Min.   :  0.00  
 Class :character   Class :character   Class :character   1st Qu.: 24.00  
 Mode  :character   Mode  :character   Mode  :character   Median : 45.00  
                                                          Mean   : 42.48  
                                                          3rd Qu.: 62.00  
                                                          Max.   :100.00  
 track_album_id     track_album_name   track_album_release_date
 Length:32833       Length:32833       Length:32833            
 Class :character   Class :character   Class :character        
 Mode  :character   Mode  :character   Mode  :character        
                                                               
                                                               
                                                               
 playlist_name      playlist_id        playlist_genre     playlist_subgenre 
 Length:32833       Length:32833       Length:32833       Length:32833      
 Class :character   Class :character   Class :character   Class :character  
 Mode  :character   Mode  :character   Mode  :character   Mode  :character  
                                                                            
                                                                            
                                                                            
  danceability        energy              key            loudness      
 Min.   :0.0000   Min.   :0.000175   Min.   : 0.000   Min.   :-46.448  
 1st Qu.:0.5630   1st Qu.:0.581000   1st Qu.: 2.000   1st Qu.: -8.171  
 Median :0.6720   Median :0.721000   Median : 6.000   Median : -6.166  
 Mean   :0.6548   Mean   :0.698619   Mean   : 5.374   Mean   : -6.720  
 3rd Qu.:0.7610   3rd Qu.:0.840000   3rd Qu.: 9.000   3rd Qu.: -4.645  
 Max.   :0.9830   Max.   :1.000000   Max.   :11.000   Max.   :  1.275  
      mode         speechiness      acousticness    instrumentalness   
 Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000000  
 1st Qu.:0.0000   1st Qu.:0.0410   1st Qu.:0.0151   1st Qu.:0.0000000  
 Median :1.0000   Median :0.0625   Median :0.0804   Median :0.0000161  
 Mean   :0.5657   Mean   :0.1071   Mean   :0.1753   Mean   :0.0847472  
 3rd Qu.:1.0000   3rd Qu.:0.1320   3rd Qu.:0.2550   3rd Qu.:0.0048300  
 Max.   :1.0000   Max.   :0.9180   Max.   :0.9940   Max.   :0.9940000  
    liveness         valence           tempo         duration_ms    
 Min.   :0.0000   Min.   :0.0000   Min.   :  0.00   Min.   :  4000  
 1st Qu.:0.0927   1st Qu.:0.3310   1st Qu.: 99.96   1st Qu.:187819  
 Median :0.1270   Median :0.5120   Median :121.98   Median :216000  
 Mean   :0.1902   Mean   :0.5106   Mean   :120.88   Mean   :225800  
 3rd Qu.:0.2480   3rd Qu.:0.6930   3rd Qu.:133.92   3rd Qu.:253585  
 Max.   :0.9960   Max.   :0.9910   Max.   :239.44   Max.   :517810

Checking Structure of Data

# Compact display of each column's type and first few values.
spotify_songs %>% str()

spc_tbl_ [32,833 × 23] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
 $ track_id                : chr [1:32833] "6f807x0ima9a1j3VPbc7VN" "0r7CVbZTWZgbTCYdfa2P31" "1z1Hg7Vb0AhHDiEmnDE79l" "75FpbthrwQmzHlBJLuGdC7" ...
 $ track_name              : chr [1:32833] "I Don't Care (with Justin Bieber) - Loud Luxury Remix" "Memories - Dillon Francis Remix" "All the Time - Don Diablo Remix" "Call You Mine - Keanu Silva Remix" ...
 $ track_artist            : chr [1:32833] "Ed Sheeran" "Maroon 5" "Zara Larsson" "The Chainsmokers" ...
 $ track_popularity        : num [1:32833] 66 67 70 60 69 67 62 69 68 67 ...
 $ track_album_id          : chr [1:32833] "2oCs0DGTsRO98Gh5ZSl2Cx" "63rPSO264uRjW1X5E6cWv6" "1HoSmj2eLcsrR0vE9gThr4" "1nqYsOef1yKKuGOVchbsk6" ...
 $ track_album_name        : chr [1:32833] "I Don't Care (with Justin Bieber) [Loud Luxury Remix]" "Memories (Dillon Francis Remix)" "All the Time (Don Diablo Remix)" "Call You Mine - The Remixes" ...
 $ track_album_release_date: chr [1:32833] "2019-06-14" "2019-12-13" "2019-07-05" "2019-07-19" ...
 $ playlist_name           : chr [1:32833] "Pop Remix" "Pop Remix" "Pop Remix" "Pop Remix" ...
 $ playlist_id             : chr [1:32833] "37i9dQZF1DXcZDD7cfEKhW" "37i9dQZF1DXcZDD7cfEKhW" "37i9dQZF1DXcZDD7cfEKhW" "37i9dQZF1DXcZDD7cfEKhW" ...
 $ playlist_genre          : chr [1:32833] "pop" "pop" "pop" "pop" ...
 $ playlist_subgenre       : chr [1:32833] "dance pop" "dance pop" "dance pop" "dance pop" ...
 $ danceability            : num [1:32833] 0.748 0.726 0.675 0.718 0.65 0.675 0.449 0.542 0.594 0.642 ...
 $ energy                  : num [1:32833] 0.916 0.815 0.931 0.93 0.833 0.919 0.856 0.903 0.935 0.818 ...
 $ key                     : num [1:32833] 6 11 1 7 1 8 5 4 8 2 ...
 $ loudness                : num [1:32833] -2.63 -4.97 -3.43 -3.78 -4.67 ...
 $ mode                    : num [1:32833] 1 1 0 1 1 1 0 0 1 1 ...
 $ speechiness             : num [1:32833] 0.0583 0.0373 0.0742 0.102 0.0359 0.127 0.0623 0.0434 0.0565 0.032 ...
 $ acousticness            : num [1:32833] 0.102 0.0724 0.0794 0.0287 0.0803 0.0799 0.187 0.0335 0.0249 0.0567 ...
 $ instrumentalness        : num [1:32833] 0.00 4.21e-03 2.33e-05 9.43e-06 0.00 0.00 0.00 4.83e-06 3.97e-06 0.00 ...
 $ liveness                : num [1:32833] 0.0653 0.357 0.11 0.204 0.0833 0.143 0.176 0.111 0.637 0.0919 ...
 $ valence                 : num [1:32833] 0.518 0.693 0.613 0.277 0.725 0.585 0.152 0.367 0.366 0.59 ...
 $ tempo                   : num [1:32833] 122 100 124 122 124 ...
 $ duration_ms             : num [1:32833] 194754 162600 176616 169093 189052 ...
 - attr(*, "spec")=
  .. cols(
  ..   track_id = col_character(),
  ..   track_name = col_character(),
  ..   track_artist = col_character(),
  ..   track_popularity = col_double(),
  ..   track_album_id = col_character(),
  ..   track_album_name = col_character(),
  ..   track_album_release_date = col_character(),
  ..   playlist_name = col_character(),
  ..   playlist_id = col_character(),
  ..   playlist_genre = col_character(),
  ..   playlist_subgenre = col_character(),
  ..   danceability = col_double(),
  ..   energy = col_double(),
  ..   key = col_double(),
  ..   loudness = col_double(),
  ..   mode = col_double(),
  ..   speechiness = col_double(),
  ..   acousticness = col_double(),
  ..   instrumentalness = col_double(),
  ..   liveness = col_double(),
  ..   valence = col_double(),
  ..   tempo = col_double(),
  ..   duration_ms = col_double()
  .. )
 - attr(*, "problems")=<externalptr>

Data Cleaning:

Checking for NULLs/Missing values

# Count the missing values in every column.
spotify_songs %>%
  is.na() %>%
  colSums()

                track_id               track_name             track_artist 
                       0                        5                        5 
        track_popularity           track_album_id         track_album_name 
                       0                        0                        5 
track_album_release_date            playlist_name              playlist_id 
                       0                        0                        0 
          playlist_genre        playlist_subgenre             danceability 
                       0                        0                        0 
                  energy                      key                 loudness 
                       0                        0                        0 
                    mode              speechiness             acousticness 
                       0                        0                        0 
        instrumentalness                 liveness                  valence 
                       0                        0                        0 
                   tempo              duration_ms 
                       0                        0

Null Values: We can observe that the variables track_name, track_album_name, and track_artist have 5 instances of missing data. Out of 32833 rows, we can remove 5 rows without any significant impact on our data.

Duplicate Values: Upon examining the dataset, it became apparent that certain songs were included more than once. Specifically, out of the 32,833 songs in the dataset, only 28,352 songs are unique. These repeated songs are identified by the same ‘track_id’, but a different ‘playlist_id’. Therefore, to remove these duplicates, we will delete the rows with repeated ‘track_id’ values. Since the ‘track_id’ serves as a unique identifier for each song, and the other numeric and categorical features associated with each song remain consistent across all instances of that song, this method of deduplication will be effective.

Removing NULL from data

# Drop every row that has a missing value in any column.
spotify_songs <- spotify_songs %>%
  na.omit()

Changing datatype of some categorical columns from string to factor.

# Convert the categorical columns to factors. Columns are referenced by their
# bare names so that mutate() always works on the data flowing through the
# pipe rather than on the outer `spotify_songs` object.
spotify_songs <- spotify_songs %>%
  mutate(
    playlist_genre = as.factor(playlist_genre),
    playlist_subgenre = as.factor(playlist_subgenre),
    mode = as.factor(mode),
    key = as.factor(key)
  )

Removing Duplicate Data

# Keep only the first row seen for each track_id, then report the new size.
is_first_occurrence <- !duplicated(spotify_songs[["track_id"]])
spotify_songs <- spotify_songs[is_first_occurrence, ]
dim(spotify_songs)

[1] 28352    23

Dropping Redundant Columns

# Remove the identifier, album-name and playlist columns that are not used in
# the analysis below.
spotify_songs <- spotify_songs %>%
  select(
    -track_id,
    -track_album_id,
    -track_album_name,
    -playlist_id,
    -playlist_name,
    -playlist_subgenre
  )

Converting date to string and creating a new column ‘year’

# The release date is read in as a character string such as "2019-06-14", so
# the year is its first four characters. as.character() is kept as a guard but
# without a month-first format string: that argument does nothing for a
# character column and, if the column were ever a Date, could put the month
# rather than the year at the front of the string.
spotify_songs$track_album_release_date <-
  as.character(spotify_songs$track_album_release_date)
spotify_songs$year <- substr(spotify_songs$track_album_release_date, 1, 4)

Changing data type of year column

# Store the extracted year as a number instead of text.
spotify_songs[["year"]] <- as.numeric(spotify_songs[["year"]])

To view Cleaned Data

# Take the first 100 cleaned rows for a quick visual check.
output_data <- head(spotify_songs, n = 100)

# View() opens a spreadsheet-style viewer and fails when the document is
# rendered non-interactively, so only call it in an interactive session.
if (interactive()) {
  View(output_data)
}

Exploratory Data Analysis:

library(dplyr)


Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

# Tally tracks per genre, largest first, and render the result as a table.
spotify_songs %>%
  count(Genre = playlist_genre, name = "No_of_tracks", sort = TRUE) %>%
  knitr::kable()

Genre	No_of_tracks
rap	5398
pop	5132
edm	4877
r&b	4504
rock	4305
latin	4136

The above code groups Spotify songs by genre and calculates the number of tracks for each genre, then arranges the genres by the number of tracks in descending order and displays them in a table using the ‘knitr’ package’s ‘kable’ function.

POPULARITY ACROSS GENRES

# Horizontal box plot of track popularity for each playlist genre.
popularity_vs_genre_plot <- spotify_songs %>%
  ggplot(aes(x = playlist_genre, y = track_popularity)) +
  geom_boxplot() +
  coord_flip() +
  ggtitle("Popularity across genres") +
  xlab("Genres") +
  ylab("Popularity")

# Convert the static ggplot into an interactive plotly widget.
ggplotly(popularity_vs_genre_plot)

The above code generates a box plot using the ‘ggplot2’ package, rendered interactively with ‘ggplotly’, to display the distribution of track popularity across different genres of Spotify songs.

ARTISTS WITH MOST TRACK RELEASES

# Bar chart of the 15 artists with the most tracks in the data set.
highest_tracks <- spotify_songs %>%
  group_by(Artist = track_artist) %>%
  summarise(No_of_tracks = n()) %>%
  # slice_max() keeps the 15 largest counts (ties included, like top_n()).
  slice_max(No_of_tracks, n = 15) %>%
  # A character x axis is drawn alphabetically regardless of row order, so
  # turn Artist into a factor ordered by track count to sort the bars.
  mutate(Artist = reorder(Artist, No_of_tracks)) %>%
  ggplot(aes(x = Artist, y = No_of_tracks)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Artists With The Most Track Releases",
    x = "Artist",
    y = "# of Tracks"
  )

# Convert the static ggplot into an interactive plotly widget.
ggplotly(highest_tracks)

The above code groups Spotify songs by artist and calculates the number of tracks released by each artist. It then selects the top 15 artists with the most track releases and creates a bar chart using the ‘ggplot2’ package.