Introduction

This project describes the spotify_songs.csv dataset from the 4th week of #tidytuesday at https://github.com/rfordatascience/tidytuesday and explores possible problems/questions.

I am going to demonstrate various data visualisation techniques.

Import data and check dimensions

# Load the TidyTuesday Spotify songs CSV directly from GitHub
spotify_songs <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-01-21/spotify_songs.csv"
)
# Number of rows (observations) and columns (variables)
dim(spotify_songs)
## [1] 32833    23

The dataset has 32833 observations of 23 variables.

# Print a compact overview of the table: one line per column with its type
# and the first few values
glimpse(spotify_songs)
## Observations: 32,833
## Variables: 23
## $ track_id                 <chr> "6f807x0ima9a1j3VPbc7VN", "0r7CVbZTWZgbTCYdf…
## $ track_name               <chr> "I Don't Care (with Justin Bieber) - Loud Lu…
## $ track_artist             <chr> "Ed Sheeran", "Maroon 5", "Zara Larsson", "T…
## $ track_popularity         <dbl> 66, 67, 70, 60, 69, 67, 62, 69, 68, 67, 58, …
## $ track_album_id           <chr> "2oCs0DGTsRO98Gh5ZSl2Cx", "63rPSO264uRjW1X5E…
## $ track_album_name         <chr> "I Don't Care (with Justin Bieber) [Loud Lux…
## $ track_album_release_date <chr> "2019-06-14", "2019-12-13", "2019-07-05", "2…
## $ playlist_name            <chr> "Pop Remix", "Pop Remix", "Pop Remix", "Pop …
## $ playlist_id              <chr> "37i9dQZF1DXcZDD7cfEKhW", "37i9dQZF1DXcZDD7c…
## $ playlist_genre           <chr> "pop", "pop", "pop", "pop", "pop", "pop", "p…
## $ playlist_subgenre        <chr> "dance pop", "dance pop", "dance pop", "danc…
## $ danceability             <dbl> 0.748, 0.726, 0.675, 0.718, 0.650, 0.675, 0.…
## $ energy                   <dbl> 0.916, 0.815, 0.931, 0.930, 0.833, 0.919, 0.…
## $ key                      <dbl> 6, 11, 1, 7, 1, 8, 5, 4, 8, 2, 6, 8, 1, 5, 5…
## $ loudness                 <dbl> -2.634, -4.969, -3.432, -3.778, -4.672, -5.3…
## $ mode                     <dbl> 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0,…
## $ speechiness              <dbl> 0.0583, 0.0373, 0.0742, 0.1020, 0.0359, 0.12…
## $ acousticness             <dbl> 0.10200, 0.07240, 0.07940, 0.02870, 0.08030,…
## $ instrumentalness         <dbl> 0.00e+00, 4.21e-03, 2.33e-05, 9.43e-06, 0.00…
## $ liveness                 <dbl> 0.0653, 0.3570, 0.1100, 0.2040, 0.0833, 0.14…
## $ valence                  <dbl> 0.518, 0.693, 0.613, 0.277, 0.725, 0.585, 0.…
## $ tempo                    <dbl> 122.036, 99.972, 124.008, 121.956, 123.976, …
## $ duration_ms              <dbl> 194754, 162600, 176616, 169093, 189052, 1630…

Data cleaning

As a first step I am going to clean the dataset and deal with missing and extreme values

Missing values

# find NAs
# Count the missing values in every column. vapply() guarantees one integer
# per column regardless of the input, unlike sapply().
df_na <- vapply(spotify_songs, function(x) sum(is.na(x)), integer(1))
# Show only the columns that contain at least one NA; naming the column
# explicitly avoids the auto-generated header "df_na.df_na...0."
data.frame(n_missing = df_na[df_na > 0])
##                  df_na.df_na...0.
## track_name                      5
## track_artist                    5
## track_album_name                5
# remove observations with missing data
# `%notin%` is the negation of `%in%`
`%notin%` <- Negate(`%in%`)
spotify_songs <- data.table(spotify_songs)
# Drop every row whose track_id belongs to a row with a missing track_name
spotify_songs <- spotify_songs[track_id %notin% track_id[is.na(track_name)]]

There were 5 missing values for track_name, track_artist and track_album_name, which I removed.

Data exploration

Next I am doing some data exploration to get more familiar with the dataset:

# Create a new dataset with unique tracks only
# distinct track ids, named by themselves (28352 ids in the original run)
ids <- unique(spotify_songs$track_id)
ids <- setNames(ids, ids)

# one row per track: keep the first row found for each track id,
# in order of first appearance
tracks <- spotify_songs[!duplicated(track_id)]
# plot numeric variables

# Positions of the numeric columns of `tracks`, named by column; this vector
# is reused further down via `..numcols`. vapply() pins the result to one
# logical per column.
numcols <- which(vapply(tracks, is.numeric, logical(1)))

# Reshape the numeric columns to long form (one row per track/column pair).
# melt() replaces the superseded tidyr::gather(); variable.factor = FALSE
# keeps `key` a character column so the facets stay in alphabetical order.
# Then draw one 10-bin histogram per column, each with its own x scale.
ggplot(
  melt(
    tracks[, ..numcols],
    measure.vars    = names(numcols),
    variable.name   = "key",
    value.name      = "value",
    variable.factor = FALSE
  ),
  aes(value)
) +
  geom_histogram(bins = 10) +
  facet_wrap(~key, scales = "free_x")

# try out different themes for one plot

# Base plot: distribution of track energy in 10 bins
p <- ggplot(data = tracks, mapping = aes(x = energy)) +
  geom_histogram(bins = 10)

# The same histogram with five ready-made themes, each paired with its
# matching fill scale
p1 <- p + theme_economist() + scale_fill_economist()
p2 <- p + theme_stata() + scale_fill_stata()
p3 <- p + theme_excel() + scale_fill_excel()
p4 <- p + theme_wsj() + scale_fill_wsj("colors6", "")
p5 <- p + theme_gdocs() + scale_fill_gdocs()

# Custom ggplot theme: green Arial axis text and titles on a green panel
# with a white border. Takes no arguments and returns the theme() object,
# to be added to a plot with `+`.
theme_custom <- function() {
  green <- "#52854C"
  font <- "Arial"

  theme(
    axis.text = element_text(family = font, color = green, size = 12),
    axis.title = element_text(
      family = font, color = green, size = 16, face = "bold"
    ),
    axis.text.y = element_text(hjust = 0.5),
    panel.background = element_rect(fill = green, color = "white", size = 2)
  )
}

# Four more variants of the base histogram: the custom theme and three
# further ready-made themes
p6 <- p + theme_custom()
p7 <- p + theme_tufte()
p8 <- p + theme_solid()
p9 <- p + theme_solarized()

#library(ggthemr)
#?ggthemr

# Show all nine variants together; the title lists the themes in the same
# order in which the plots are passed
grid.arrange(
  grobs = list(p1, p3, p2, p6, p8, p7, p4, p5, p9),
  top = paste(
    "Themes:", "\n",
    "row 1: economist, excel, stata", "\n",
    "row 2: custom, solid, tufte", "\n",
    "row 3: wsj, gdocs, solarized"
  )
)

# inspect factor variables about genre:
# number of tracks, mean tempo and mean energy per genre / sub-genre pair
tracks[, .(
  count      = .N,
  avg_tempo  = mean(tempo),
  avg_energy = mean(energy)
), by = c("playlist_genre", "playlist_subgenre")]
##     playlist_genre         playlist_subgenre count avg_tempo avg_energy
##  1:            pop                 dance pop  1298  120.1066  0.7421888
##  2:            pop             post-teen pop  1036  124.3547  0.7184825
##  3:            pop                electropop  1251  122.6898  0.7228383
##  4:            pop           indie poptimism  1547  118.0129  0.6371750
##  5:            rap                   hip hop  1296  118.0693  0.5647272
##  6:            rap          southern hip hop  1582  118.9214  0.6810569
##  7:            rap              gangster rap  1314  116.6103  0.6884680
##  8:            rap                      trap  1206  129.8008  0.6579403
##  9:           rock                album rock  1039  122.5159  0.6625255
## 10:           rock              classic rock  1100  123.5544  0.6975100
## 11:           rock            permanent wave   964  124.7375  0.7092049
## 12:           rock                 hard rock  1202  128.8220  0.8457180
## 13:          latin                  tropical  1158  116.9400  0.6735464
## 14:          latin                 latin pop  1097  120.1612  0.6922179
## 15:          latin                 reggaeton   687  117.6784  0.7543552
## 16:          latin             latin hip hop  1194  119.0557  0.7376173
## 17:            r&b        urban contemporary  1187  117.7820  0.5673791
## 18:            r&b                   hip pop   803  116.3243  0.6224746
## 19:            r&b            new jack swing  1036  113.0174  0.6561952
## 20:            r&b                  neo soul  1478  110.1352  0.5408695
## 21:            edm             electro house  1416  125.1971  0.8024859
## 22:            edm                  big room  1034  129.2729  0.8690493
## 23:            edm                   pop edm   967  124.8919  0.7554705
## 24:            edm progressive electro house  1460  126.2906  0.8102616
##     playlist_genre         playlist_subgenre count avg_tempo avg_energy

Questions to analyze

Which genre is the most popular?

Which genres' tracks contain a lot of spoken words, and which are composed of more acoustic parts?

In which genre can we find the most live tracks?

# Per genre: mean popularity (2 decimals) and mean speechiness,
# acousticness and liveness (4 decimals each),
# sorted by mean popularity in descending order
tracks[, .(
  avg_popularity   = round(mean(track_popularity), 2),
  avg_speechiness  = round(mean(speechiness), 4),
  avg_acousticness = round(mean(acousticness), 4),
  avg_liveness     = round(mean(liveness), 4)
), by = "playlist_genre"][order(-avg_popularity)]
##    playlist_genre avg_popularity avg_speechiness avg_acousticness avg_liveness
## 1:            pop          45.91          0.0742           0.1721       0.1773
## 2:            rap          41.85          0.1974           0.1966       0.1911
## 3:          latin          41.45          0.1005           0.2127       0.1817
## 4:           rock          39.69          0.0579           0.1475       0.2048
## 5:            r&b          35.93          0.1155           0.2641       0.1763
## 6:            edm          30.68          0.0879           0.0769       0.2143

The most popular genre, on average, is pop, followed by rap and latin.

Looking at speechiness, tracks that are made entirely of spoken words are close to 1, while values below 0.33 most likely represent music and other non-speech-like tracks. Within the music category rap is obviously the genre with the highest presence of spoken words in a track, while rock music seems to be rather sparing with words.

Acousticness is a confidence measure from 0.0 to 1.0 of whether the track is acoustic. Among all genres r&b shows the highest average acoustic score in this dataset.

Liveness detects the presence of an audience in the recording. On average, edm tracks were the most likely to have been performed live in this dataset.

How do different hip hop tracks compare to each other?

# MDS (multidimensional scaling)
# for 25 randomly selected tracks in subgenre hip hop

set.seed(456)
# The heading and the comment above both ask about hip hop; the filter used
# to read "hip pop", which is a different sub-genre (listed under r&b).
df <- tracks[playlist_subgenre == "hip hop", ]
# seq_len() instead of 1:nrow() so an empty subset cannot yield c(1, 0)
df <- df[sample(seq_len(nrow(df)), 25, replace = FALSE), ]
# Label: artist - title - first three characters of the track id
# (the id prefix presumably keeps the row names unique)
rn <- paste(
  df$track_artist,
  df$track_name,
  substr(df$track_id, start = 1, stop = 3),
  sep = " - "
)
# Standardise every numeric column (mean 0, sd 1) before computing distances
df <- as.data.frame(scale(df[, ..numcols]))
rownames(df) <- rn

# Classical MDS on the Euclidean distance matrix; the two resulting
# coordinates become columns V1 and V2
df_mds <- data.table(cmdscale(dist(df)), keep.rownames = TRUE)
df_mds$song <- rn

# Plot the tracks in the 2-D MDS space (second axis flipped), labelled by song
ggplot(df_mds, aes(V1, -V2, label = song)) +
  geom_point() +
  geom_text(hjust = 0, nudge_x = 0.1, size = 3) +
  xlim(-3, 10) +
  theme_bw()

How does energy influence track popularity?

# ANIMATION
# energy vs track_popularity for 1000 randomly chosen tracks,
# shown one playlist genre at a time

set.seed(456)

# seq_len() instead of 1:nrow() so an empty table cannot yield c(1, 0)
df2 <- tracks[sample(seq_len(nrow(tracks)), 1000, replace = FALSE), ]
# Label: artist - title - first three characters of the track id
rn <- paste(
  df2$track_artist,
  df2$track_name,
  substr(df2$track_id, start = 1, stop = 3),
  sep = " - "
)
# Standardise every numeric column (mean 0, sd 1) before clustering
df2num <- as.data.frame(scale(df2[, ..numcols]))
rownames(df2num) <- rn

# Hierarchical clustering on Euclidean distances, cut into 3 clusters
dm <- dist(df2num)
hc <- hclust(dm)
clusters <- dendextend::cutree(hc, 3)

df2$cluster <- factor(clusters)

# Colour by the `cluster` column stored in df2 rather than by the global
# `clusters` vector, so the plot depends only on its own data.
ggplot(df2, aes(energy, track_popularity, color = cluster)) +
  geom_point(size = 3) +
  geom_smooth(method = "lm", se = FALSE) +
  transition_states(playlist_genre) +
  labs(
    colour = "Cluster",
    title = "{closest_state}",
    subtitle = "Number of tracks: {nrow(subset(df2, playlist_genre == closest_state))}"
  ) +
  # The mapped aesthetic is colour, so the colour scale is needed here;
  # the fill scale used before had no effect on this plot.
  theme_economist() + scale_colour_economist()

What are the 10 most favoured artists in terms of their track popularity?

# Mean track popularity (2 decimals) and number of tracks per artist,
# sorted by mean popularity in descending order; first 10 rows
tracks[, .(
  avg_popularity = round(mean(track_popularity), 2),
  tracks         = .N
), by = "track_artist"][order(-avg_popularity)][1:10]
##       track_artist avg_popularity tracks
##  1:  Trevor Daniel          97.00      1
##  2:            Y2K          91.00      1
##  3:    Don Toliver          87.50      2
##  4:           Kina          85.50      2
##  5:       JACKBOYS          84.33      3
##  6:   Dadá Boladão          84.00      1
##  7:         DaBaby          83.67      6
##  8:    Roddy Ricch          83.43      7
##  9:      Baby Keem          83.00      1
## 10: Internet Money          83.00      1

Who is Trevor Daniel?

# All tracks by Trevor Daniel: id, name, popularity, album and genre
# (columns 1, 2, 4, 6 and 10 of `tracks`), sorted by album name
tracks[
  track_artist == "Trevor Daniel",
  .(track_id, track_name, track_popularity, track_album_name, playlist_genre)
][order(track_album_name)]
##                  track_id track_name track_popularity track_album_name
## 1: 4TnjEaWOeW0eKTKIEvJyCa    Falling               97          Falling
##    playlist_genre
## 1:            pop

Is there a different energy in tracks that are conveying positiveness or negativeness?

# group valence into three groups
# cut() splits the valence range into 3 intervals labelled negative / neutral /
# positive (ordered factor). `:=` adds the column to spotify_songs by
# reference, and `a` refers to that same table.
spotify_songs[, valencecat := cut(
  valence,
  breaks = 3,
  labels = c("negative", "neutral", "positive"),
  ordered_result = TRUE
)]
a <- spotify_songs

# boxplot + violinplot
# energy distribution for each valence group
ggplot(
  data = a,
  mapping = aes(x = valencecat, y = energy, color = valencecat, fill = valencecat)
) +
  geom_violin(alpha = 0.3) +
  geom_boxplot(size = 0.7, width = 0.4, alpha = 0.3) +
  theme_bw()

Valence is a measure from 0.0 to 1.0 describing the musical positiveness conveyed by a track. Tracks with high valence sound more positive (e.g. happy, cheerful, euphoric), while tracks with low valence sound more negative (e.g. sad, depressed, angry).

Energy is a measure from 0.0 to 1.0 and represents a perceptual measure of intensity and activity. Typically, energetic tracks feel fast, loud, and noisy. For example, death metal has high energy, while a Bach prelude scores low on the scale. Perceptual features contributing to this attribute include dynamic range, perceived loudness, timbre, onset rate, and general entropy.

I grouped the tracks into three groups based on their valence and visualised their energy distribution.

Positive tracks have, on average, more energy than negative tracks.

Is mode an indicator for positiveness or negativeness (valence)?

# Density chart: valence distribution per genre, split by mode
ggplot(tracks, aes(valence, fill = factor(mode))) +
  geom_density(alpha = 0.25) +
  # Mean valence over all tracks as a reference line in every panel.
  # Passing it as a constant draws one line per panel instead of one per row.
  geom_vline(xintercept = mean(tracks$valence), color = "grey") +
  facet_wrap(~playlist_genre) +
  # theme_bw() is a complete theme and replaces all earlier theme settings,
  # so the legend position has to be set after it to take effect.
  theme_bw() +
  theme(legend.position = "top")

Mode indicates the modality (major or minor) of a track, the type of scale from which its melodic content is derived. Major is represented by 1 and minor is 0.

We can only observe small differences. For example, latin tracks tend to be positive (high valence), and the minor mode is used more frequently in those high-valence tracks. The opposite is true for rock, where tracks with high valence (positiveness) use major more often, while minor is more common in tracks with low valence (negativeness).

Which genres are most energetic, loud, and good to dance?

# Heatmap: mean standardised audio feature per genre
heat_features <- c("danceability", "energy", "loudness", "tempo")

# Standardise each feature over ALL tracks first, then average per genre.
# Calling scale() inside `by = playlist_genre` centres every feature within
# its own genre, which makes each per-genre mean ~0 and the heatmap
# meaningless.
genre_profile <- tracks[
  ,
  c(
    list(playlist_genre = playlist_genre),
    lapply(.SD, function(x) as.numeric(scale(x)))
  ),
  .SDcols = heat_features
][, lapply(.SD, mean), by = playlist_genre, .SDcols = heat_features]

# NOTE(review): any written interpretation of this heatmap should be
# re-checked against the corrected plot.
ggplot(
  melt(genre_profile, id.vars = "playlist_genre"),
  aes(playlist_genre, variable, fill = value)
) +
  geom_tile() +
  scale_fill_viridis_c()

r&b tracks have the highest average beat duration, loudness and energy. edm is the genre with the highest danceability on average.

What are the most danceable songs per genre?

# Most danceable track of every genre: keep name, artist, genre and
# danceability (columns 2, 3, 10 and 12 of `tracks`), sort by danceability
# in descending order and take the first row per genre
tracks[
  ,
  .(track_name, track_artist, playlist_genre, danceability)
][
  order(-danceability)
][
  ,
  .SD[1L],
  by = playlist_genre
]
##    playlist_genre                                       track_name
## 1:            edm If Only I Could (feat. Steve Lucas) - Liem Remix
## 2:            pop                                     Ice Ice Baby
## 3:          latin                  Enseñame a Soñar - Original Mix
## 4:            r&b                                        Slow Down
## 5:            rap                                     Funky Friday
## 6:           rock                                         Hunnybee
##                track_artist danceability
## 1:  Fusion Groove Orchestra        0.983
## 2:              Vanilla Ice        0.979
## 3:                 DJ Goozo        0.979
## 4:               India.Arie        0.977
## 5:                     Dave        0.975
## 6: Unknown Mortal Orchestra        0.956

What are the top ten songs that are most often part of a playlist?

# list of tracks that were most often part of playlists:
# number of distinct playlist ids per (track name, artist) pair,
# sorted in descending order, first 10 rows
head(
  spotify_songs[
    ,
    .(count = length(unique(playlist_id))),
    by = c("track_name", "track_artist")
  ][order(-count)],
  n = 10
)
##                                        track_name   track_artist count
##  1:                                     One Dance          Drake    12
##  2:                                      Señorita   Shawn Mendes    11
##  3:                            Livin' On A Prayer       Bon Jovi    11
##  4:           I Took A Pill In Ibiza - Seeb Remix    Mike Posner    10
##  5:                            Sweet Home Alabama Lynyrd Skynyrd    10
##  6:                           Sweet Child O' Mine  Guns N' Roses    10
##  7:                                 Cheap Thrills            Sia     9
##  8:                                       ROXANNE Arizona Zervas     9
##  9:             I Don't Care (with Justin Bieber)     Ed Sheeran     9
## 10: Sunflower - Spider-Man: Into the Spider-Verse    Post Malone     9

What genre is “One Dance” by “Drake”?

# Number of rows for "One Dance" by Drake per genre / sub-genre pair,
# sorted by genre
spotify_songs[
  track_name == "One Dance" & track_artist == "Drake",
  .(count = .N),
  by = c("playlist_genre", "playlist_subgenre")
][order(playlist_genre)]
##    playlist_genre  playlist_subgenre count
## 1:            edm            pop edm     2
## 2:          latin      latin hip hop     2
## 3:            pop         electropop     4
## 4:            pop    indie poptimism     1
## 5:            r&b urban contemporary     1
## 6:            r&b            hip pop     1
## 7:            rap   southern hip hop     1

Show all Bon Jovi tracks by their album name

# TOOLTIP
# Interactive scatter plot of loudness vs energy for all Bon Jovi rows,
# coloured by album; each point's tooltip shows the track name
p <- ggplot(
  data = spotify_songs[track_artist == "Bon Jovi"],
  mapping = aes(
    x = loudness,
    y = energy,
    colour = factor(track_album_name),
    tooltip = paste("track:", track_name)
  )
) +
  geom_point_interactive(size = 3) +
  labs(colour = "Album", x = "loudness", y = "energy") +
  theme(legend.text = element_text(size = 6)) +
  scale_color_viridis_d()

# Render the plot as an interactive widget with hover styling and zoom
# limited to a factor of 2
girafe(
  ggobj = p,
  options = list(
    opts_hover(css = "fill:black;"),
    opts_zoom(max = 2)
  )
)

Summary

In this project I have demonstrated various data visualisation techniques that allowed me to gain the following insights into the spotify_songs dataset:

Surprising to me was that edm (electronic dance music) had on average a low score for danceability and that I haven’t heard of any of the most popular artists (which could be explained by my age or insufficient exposure to current music hits).