install.packages("magrittr")
Error in contrib.url(repos, "source"): trying to use CRAN without setting a mirror
Pradhakshya Dhanakumar
May 1, 2023
Error in contrib.url(repos, "source"): trying to use CRAN without setting a mirror
Error in contrib.url(repos, "source"): trying to use CRAN without setting a mirror
Error in contrib.url(repos, "source"): trying to use CRAN without setting a mirror
Loading required package: ggplot2
Attaching package: 'plotly'
The following object is masked from 'package:ggplot2':
last_plot
The following object is masked from 'package:stats':
filter
The following object is masked from 'package:graphics':
layout
spotify_songs <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-01-21/spotify_songs.csv', show_col_types = FALSE)
View(spotify_songs)
Error in check_for_XQuartz(file.path(R.home("modules"), "R_de.so")): X11 library is missing: install XQuartz from www.xquartz.org
track_id track_name track_artist track_popularity
Length:32833 Length:32833 Length:32833 Min. : 0.00
Class :character Class :character Class :character 1st Qu.: 24.00
Mode :character Mode :character Mode :character Median : 45.00
Mean : 42.48
3rd Qu.: 62.00
Max. :100.00
track_album_id track_album_name track_album_release_date
Length:32833 Length:32833 Length:32833
Class :character Class :character Class :character
Mode :character Mode :character Mode :character
playlist_name playlist_id playlist_genre playlist_subgenre
Length:32833 Length:32833 Length:32833 Length:32833
Class :character Class :character Class :character Class :character
Mode :character Mode :character Mode :character Mode :character
danceability energy key loudness
Min. :0.0000 Min. :0.000175 Min. : 0.000 Min. :-46.448
1st Qu.:0.5630 1st Qu.:0.581000 1st Qu.: 2.000 1st Qu.: -8.171
Median :0.6720 Median :0.721000 Median : 6.000 Median : -6.166
Mean :0.6548 Mean :0.698619 Mean : 5.374 Mean : -6.720
3rd Qu.:0.7610 3rd Qu.:0.840000 3rd Qu.: 9.000 3rd Qu.: -4.645
Max. :0.9830 Max. :1.000000 Max. :11.000 Max. : 1.275
mode speechiness acousticness instrumentalness
Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000000
1st Qu.:0.0000 1st Qu.:0.0410 1st Qu.:0.0151 1st Qu.:0.0000000
Median :1.0000 Median :0.0625 Median :0.0804 Median :0.0000161
Mean :0.5657 Mean :0.1071 Mean :0.1753 Mean :0.0847472
3rd Qu.:1.0000 3rd Qu.:0.1320 3rd Qu.:0.2550 3rd Qu.:0.0048300
Max. :1.0000 Max. :0.9180 Max. :0.9940 Max. :0.9940000
liveness valence tempo duration_ms
Min. :0.0000 Min. :0.0000 Min. : 0.00 Min. : 4000
1st Qu.:0.0927 1st Qu.:0.3310 1st Qu.: 99.96 1st Qu.:187819
Median :0.1270 Median :0.5120 Median :121.98 Median :216000
Mean :0.1902 Mean :0.5106 Mean :120.88 Mean :225800
3rd Qu.:0.2480 3rd Qu.:0.6930 3rd Qu.:133.92 3rd Qu.:253585
Max. :0.9960 Max. :0.9910 Max. :239.44 Max. :517810
Checking Structure of Data
spc_tbl_ [32,833 × 23] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
$ track_id : chr [1:32833] "6f807x0ima9a1j3VPbc7VN" "0r7CVbZTWZgbTCYdfa2P31" "1z1Hg7Vb0AhHDiEmnDE79l" "75FpbthrwQmzHlBJLuGdC7" ...
$ track_name : chr [1:32833] "I Don't Care (with Justin Bieber) - Loud Luxury Remix" "Memories - Dillon Francis Remix" "All the Time - Don Diablo Remix" "Call You Mine - Keanu Silva Remix" ...
$ track_artist : chr [1:32833] "Ed Sheeran" "Maroon 5" "Zara Larsson" "The Chainsmokers" ...
$ track_popularity : num [1:32833] 66 67 70 60 69 67 62 69 68 67 ...
$ track_album_id : chr [1:32833] "2oCs0DGTsRO98Gh5ZSl2Cx" "63rPSO264uRjW1X5E6cWv6" "1HoSmj2eLcsrR0vE9gThr4" "1nqYsOef1yKKuGOVchbsk6" ...
$ track_album_name : chr [1:32833] "I Don't Care (with Justin Bieber) [Loud Luxury Remix]" "Memories (Dillon Francis Remix)" "All the Time (Don Diablo Remix)" "Call You Mine - The Remixes" ...
$ track_album_release_date: chr [1:32833] "2019-06-14" "2019-12-13" "2019-07-05" "2019-07-19" ...
$ playlist_name : chr [1:32833] "Pop Remix" "Pop Remix" "Pop Remix" "Pop Remix" ...
$ playlist_id : chr [1:32833] "37i9dQZF1DXcZDD7cfEKhW" "37i9dQZF1DXcZDD7cfEKhW" "37i9dQZF1DXcZDD7cfEKhW" "37i9dQZF1DXcZDD7cfEKhW" ...
$ playlist_genre : chr [1:32833] "pop" "pop" "pop" "pop" ...
$ playlist_subgenre : chr [1:32833] "dance pop" "dance pop" "dance pop" "dance pop" ...
$ danceability : num [1:32833] 0.748 0.726 0.675 0.718 0.65 0.675 0.449 0.542 0.594 0.642 ...
$ energy : num [1:32833] 0.916 0.815 0.931 0.93 0.833 0.919 0.856 0.903 0.935 0.818 ...
$ key : num [1:32833] 6 11 1 7 1 8 5 4 8 2 ...
$ loudness : num [1:32833] -2.63 -4.97 -3.43 -3.78 -4.67 ...
$ mode : num [1:32833] 1 1 0 1 1 1 0 0 1 1 ...
$ speechiness : num [1:32833] 0.0583 0.0373 0.0742 0.102 0.0359 0.127 0.0623 0.0434 0.0565 0.032 ...
$ acousticness : num [1:32833] 0.102 0.0724 0.0794 0.0287 0.0803 0.0799 0.187 0.0335 0.0249 0.0567 ...
$ instrumentalness : num [1:32833] 0.00 4.21e-03 2.33e-05 9.43e-06 0.00 0.00 0.00 4.83e-06 3.97e-06 0.00 ...
$ liveness : num [1:32833] 0.0653 0.357 0.11 0.204 0.0833 0.143 0.176 0.111 0.637 0.0919 ...
$ valence : num [1:32833] 0.518 0.693 0.613 0.277 0.725 0.585 0.152 0.367 0.366 0.59 ...
$ tempo : num [1:32833] 122 100 124 122 124 ...
$ duration_ms : num [1:32833] 194754 162600 176616 169093 189052 ...
- attr(*, "spec")=
.. cols(
.. track_id = col_character(),
.. track_name = col_character(),
.. track_artist = col_character(),
.. track_popularity = col_double(),
.. track_album_id = col_character(),
.. track_album_name = col_character(),
.. track_album_release_date = col_character(),
.. playlist_name = col_character(),
.. playlist_id = col_character(),
.. playlist_genre = col_character(),
.. playlist_subgenre = col_character(),
.. danceability = col_double(),
.. energy = col_double(),
.. key = col_double(),
.. loudness = col_double(),
.. mode = col_double(),
.. speechiness = col_double(),
.. acousticness = col_double(),
.. instrumentalness = col_double(),
.. liveness = col_double(),
.. valence = col_double(),
.. tempo = col_double(),
.. duration_ms = col_double()
.. )
- attr(*, "problems")=<externalptr>
Checking for NULLs/Missing values
track_id track_name track_artist
0 5 5
track_popularity track_album_id track_album_name
0 0 5
track_album_release_date playlist_name playlist_id
0 0 0
playlist_genre playlist_subgenre danceability
0 0 0
energy key loudness
0 0 0
mode speechiness acousticness
0 0 0
instrumentalness liveness valence
0 0 0
tempo duration_ms
0 0
Null Values: We can observe that the variables track_name, track_album_name, and track_artist have 5 instances of missing data. Out of 32833 rows, we can remove 5 rows without any significant impact on our data.
Duplicate Values: Upon examining the dataset, it became apparent that certain songs were included more than once. Specifically, out of the 32,833 songs in the dataset, only 28,352 songs are unique. These repeated songs are identified by the same ‘track_id’, but a different ‘playlist_id’. Therefore, to remove these duplicates, we will delete the rows with repeated ‘track_id’ values. Since the ‘track_id’ serves as a unique identifier for each song, and the other numeric and categorical features associated with each song remain consistent across all instances of that song, this method of deduplication will be effective.
Removing NULL from data
Changing datatype of some categorical columns from string to factor.
Removing Duplicate Data
[1] 28352 23
Dropping Redundant Columns
The release date is already stored as a string; creating a new column ‘year’ from its first four characters
Changing data type of year column
To view Cleaned Data
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
spotify_songs %>% group_by(Genre = playlist_genre) %>%
summarise(No_of_tracks = n()) %>%
arrange(desc(No_of_tracks)) %>% knitr::kable()
Genre | No_of_tracks |
---|---|
rap | 5398 |
pop | 5132 |
edm | 4877 |
r&b | 4504 |
rock | 4305 |
latin | 4136 |
The above code groups Spotify songs by genre and calculates the number of tracks for each genre, then arranges the genres by the number of tracks in descending order and displays them in a table using the ‘knitr’ package’s ‘kable’ function.
POPULARITY ACROSS GENRES
popularity_vs_genre_plot<- ggplot(spotify_songs, aes(x = playlist_genre, y =
track_popularity)) +
geom_boxplot() +
coord_flip() +
labs(title = "Popularity across genres", x = "Genres", y = "Popularity")
ggplotly(popularity_vs_genre_plot)
The above code generates a box plot using the ‘ggplot2’ package to display the distribution of track popularity across different genres of Spotify songs.
ARTISTS WITH MOST TRACK RELEASES
highest_tracks <- spotify_songs %>% group_by(Artist = track_artist) %>%
summarise(No_of_tracks = n()) %>%
arrange(desc(No_of_tracks)) %>%
top_n(15, wt = No_of_tracks) %>%
ggplot(aes(x = Artist, y = No_of_tracks)) +
geom_bar(stat = "identity") +
coord_flip() + labs(title = "Artists With The Most Track Releases", x = "Artist", y = "# of Tracks")
ggplotly(highest_tracks)
The above code groups Spotify songs by artist and calculates the number of tracks released by each artist. It then selects the top 15 artists with the most track releases and creates a bar chart using the ‘ggplot2’ package.
---
title: "Final Project Assignment#2 : Exploratory Analysis and Visualization: Project & Data Description"
author: "Pradhakshya Dhanakumar"
description: "Exploratory Analysis and Visualization"
date: "05/01/2023"
format:
html:
df-print: paged
toc: true
code-copy: true
code-tools: true
css: styles.css
categories:
- final_Project_assignment_2
- final_project_exploratory_analysis
editor_options:
chunk_output_type: console
---
```{r}
# Install magrittr only when it is missing, and name a CRAN mirror explicitly:
# a bare install.packages() aborts a non-interactive render with
# "trying to use CRAN without setting a mirror".
if (!requireNamespace("magrittr", quietly = TRUE)) {
  install.packages("magrittr", repos = "https://cloud.r-project.org")
}
```
```{r}
# Install plotly only when it is missing, and name a CRAN mirror explicitly:
# a bare install.packages() aborts a non-interactive render with
# "trying to use CRAN without setting a mirror".
if (!requireNamespace("plotly", quietly = TRUE)) {
  install.packages("plotly", repos = "https://cloud.r-project.org")
}
```
```{r}
# Install ggplot2 only when it is missing, and name a CRAN mirror explicitly:
# a bare install.packages() aborts a non-interactive render with
# "trying to use CRAN without setting a mirror".
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2", repos = "https://cloud.r-project.org")
}
```
```{r}
library(magrittr)
library(plotly)
```
## Understanding Data:
```{r}
# Read the TidyTuesday (2020-01-21) Spotify songs CSV directly from GitHub.
# show_col_types = FALSE silences readr's column-specification message.
spotify_songs <- readr::read_csv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-01-21/spotify_songs.csv",
  show_col_types = FALSE
)

# View() opens a GUI data viewer, which is unavailable while the document is
# being rendered (it fails there with an X11/XQuartz error), so only call it
# from an interactive session.
if (interactive()) {
  View(spotify_songs)
}
```
```{r}
# Number of rows and columns in the raw data.
spotify_songs %>% dim()
```
```{r}
# Show the last five rows of the raw data.
spotify_songs %>% tail(n = 5)
```
```{r}
# Per-column summary: five-number summaries for numeric columns,
# length/class/mode for character columns.
spotify_songs %>% summary()
```
Checking Structure of Data
```{r}
# Compact display of each column's type and first few values.
spotify_songs %>% str()
```
## Data Cleaning:
Checking for NULLs/Missing values
```{r}
# Count the missing values in every column.
spotify_songs %>%
  is.na() %>%
  colSums()
```
Null Values:
We can observe that the variables track_name, track_album_name, and track_artist have 5 instances of missing data. Out of 32833 rows, we can remove 5 rows without any significant impact on our data.
Duplicate Values:
Upon examining the dataset, it became apparent that certain songs were included more than once. Specifically, out of the 32,833 songs in the dataset, only 28,352 songs are unique. These repeated songs are identified by the same 'track_id', but a different 'playlist_id'. Therefore, to remove these duplicates, we will delete the rows with repeated 'track_id' values. Since the 'track_id' serves as a unique identifier for each song, and the other numeric and categorical features associated with each song remain consistent across all instances of that song, this method of deduplication will be effective.
Removing NULL from data
```{r}
# Drop every row that has a missing value in any column.
spotify_songs <- spotify_songs %>% na.omit()
```
Changing datatype of some categorical columns from string to factor.
```{r}
# Recode the categorical columns as factors.
# Columns are referenced by bare name inside mutate(): pulling them with
# `spotify_songs$...` bypasses the data mask and would silently use the
# full-length column if this step were ever run on grouped or filtered data.
spotify_songs <- spotify_songs %>%
  mutate(
    playlist_genre = as.factor(playlist_genre),
    playlist_subgenre = as.factor(playlist_subgenre),
    mode = as.factor(mode),
    key = as.factor(key)
  )
```
Removing Duplicate Data
```{r}
# Keep only the first row encountered for each track_id, then report the
# dimensions of the de-duplicated data.
is_first_occurrence <- !duplicated(spotify_songs[["track_id"]])
spotify_songs <- spotify_songs[is_first_occurrence, ]
dim(spotify_songs)
```
Dropping Redundant Columns
```{r}
# Remove identifier and playlist-level columns that the analysis does not use.
spotify_songs <- spotify_songs %>%
  select(
    -track_id,
    -track_album_id,
    -track_album_name,
    -playlist_id,
    -playlist_name,
    -playlist_subgenre
  )
```
The release date is already stored as a string; creating a new column 'year' from its first four characters
```{r}
# Derive the release year from the first four characters of the release date.
# The dates are read as ISO-style strings ("2019-06-14"), so the year always
# comes first. No format is passed to as.character(): the previous
# "%m/%d/%Y" argument was ignored for character input and, for a Date column,
# would have moved the month to the front and made substr(…, 1, 4) return
# "06/1" instead of the year.
spotify_songs$year <- substr(
  as.character(spotify_songs$track_album_release_date),
  1,
  4
)
```
Changing data type of year column
```{r}
# Store the release year as a number rather than a string.
spotify_songs[["year"]] <- as.numeric(spotify_songs[["year"]])
```
To view Cleaned Data
```{r}
# Take the first 100 cleaned rows as a preview.
output_data <- head(spotify_songs, n = 100)

# View() needs a GUI data viewer and fails while the document is rendered, so
# open the viewer only in an interactive session; when rendering, print the
# preview into the document instead.
if (interactive()) {
  View(output_data)
} else {
  output_data
}
```
## Exploratory Data Analysis:
```{r}
library(dplyr)
```
```{r}
# Count the tracks in each playlist genre.
genre_counts <- spotify_songs %>%
  group_by(Genre = playlist_genre) %>%
  summarise(No_of_tracks = n())

# Largest genre first, rendered as a markdown table.
genre_counts <- arrange(genre_counts, desc(No_of_tracks))
knitr::kable(genre_counts)
```
The above code groups Spotify songs by genre and calculates the number of tracks for each genre, then arranges the genres by the number of tracks in descending order and displays them in a table using the 'knitr' package's 'kable' function.
POPULARITY ACROSS GENRES
```{r}
# Box plot of track popularity for each playlist genre, flipped so the genres
# run down the vertical axis.
popularity_vs_genre_plot <- ggplot(
  data = spotify_songs,
  mapping = aes(x = playlist_genre, y = track_popularity)
)
popularity_vs_genre_plot <- popularity_vs_genre_plot +
  geom_boxplot() +
  coord_flip() +
  labs(title = "Popularity across genres", x = "Genres", y = "Popularity")

# Convert the static ggplot into an interactive plotly widget.
ggplotly(popularity_vs_genre_plot)
```
The above code generates a box plot using the 'ggplot2' package to display the distribution of track popularity across different genres of Spotify songs.
ARTISTS WITH MOST TRACK RELEASES
```{r}
# Bar chart of the 15 artists with the most tracks in the cleaned data.
highest_tracks <- spotify_songs %>%
  group_by(Artist = track_artist) %>%
  summarise(No_of_tracks = n()) %>%
  # slice_max() replaces the superseded top_n(); like top_n() it keeps ties,
  # so more than 15 artists can appear when counts are equal at the cut-off.
  slice_max(No_of_tracks, n = 15) %>%
  # ggplot orders a discrete axis by factor level (alphabetically for plain
  # strings), so sorting the rows has no effect on the chart. Reorder the
  # factor levels by track count to get bars sorted by size.
  mutate(Artist = reorder(Artist, No_of_tracks)) %>%
  ggplot(aes(x = Artist, y = No_of_tracks)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Artists With The Most Track Releases",
    x = "Artist",
    y = "# of Tracks"
  )

# Convert the static ggplot into an interactive plotly widget.
ggplotly(highest_tracks)
```
The above code groups Spotify songs by artist and calculates the number of tracks released by each artist. It then selects the top 15 artists with the most track releases and creates a bar chart using the 'ggplot2' package.