# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# Imports for web scraping
import requests
from bs4 import BeautifulSoup


top100 = pd.DataFrame()

# General form of URL
url = 'https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_'
for year in range(1960, 2021):
    # Get HTML from URL and parse
    r = requests.get(url + str(year))
    html = r.content
    root = BeautifulSoup(r.content, "html.parser")

    # Search for a table with the class "wiki table" and read into dataframe
    r = root.find("table", "wikitable")
    pretty = r.prettify()
    df = pd.read_html(pretty)[0]

    # Rename columns and add a column for the year
    df.columns = ['Rank', 'Song', 'Artist(s)']
    df = df.assign(Year = year)

    # Add dataframe to top100
    top100 = top100.append(df)

display(top100.head())
top100.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6101 entries, 0 to 99
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Rank       6101 non-null   object
 1   Song       6101 non-null   object
 2   Artist(s)  6101 non-null   object
 3   Year       6101 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 238.3+ KB


# Reformat rank to be integers
top100.loc[top100['Rank'] == 'Tie', 'Rank'] = 100
top100 = top100.astype({'Rank': 'int64'})

# Remove quotations from song titles
top100["Song"] = top100["Song"].str.strip('"').str.strip()
top100.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6101 entries, 0 to 99
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Rank       6101 non-null   int64 
 1   Song       6101 non-null   object
 2   Artist(s)  6101 non-null   object
 3   Year       6101 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 238.3+ KB


# Export DataFrame
top100.to_csv('../data/RawYearlyTop100.csv', index=False)


# Imports for Spotify searching
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import json


# Read in data from CSV
top100 = pd.read_csv('../data/RawYearlyTop100.csv')


# Choosing the first artist
top100["Artist(s)"] = top100["Artist(s)"].str.split('featuring', expand=True)[0]
top100["Artist(s)"] = top100["Artist(s)"].str.split('and', expand=True)[0]
top100["Artist(s)"] = top100["Artist(s)"].str.split('&', expand=True)[0]
top100["Artist(s)"] = top100["Artist(s)"].str.split(',', expand=True)[0]

# Cleaning up the songs
top100["Song"] = top100["Song"].str.split("/", expand=True)[0]
top100["Song"] = top100["Song"].str.replace("'", " ")


# Create query column 
top100["Query"] = "track:" + top100["Song"] + " artist:" + top100["Artist(s)"]
top100.head()


# Create python object from json file
with open('../config.json') as file:
    app_data = json.load(file)

# Authenticate with Spotipy
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials( \
                     client_id=app_data['client_id'], \
                     client_secret=app_data['client_secret']))


# Adds the desired data in track to the row object
def add_data_to_row(row, track):
    row['ID'] = track['id']
    row['Popularity'] = track['popularity']
    row['Release Date'] = track['album']['release_date']

    # Create a list of the artists
    artists = []
    for i, artist in enumerate(track['artists']):
        artists.append(artist['id'])
    row['Artist IDs'] = artists
    
    row['Album ID'] = track['album']['id']
    return row


# Initialize new DataFrame
new_columns = ["Year", "Rank", "ID", "Song", "Artist(s)", "Artist IDs", "Album ID", "Release Date", "Popularity", "Query"]
results = pd.DataFrame(columns=new_columns)
no_results = pd.DataFrame(columns = ["Year", "Rank", "Song", "Artist(s)", "Query"])

# Iterate over every row in top100
for i, row in top100.iterrows():

    # Make search with specific query
    search_res = sp.search(q=row['Query'], limit=5)
    tracks = search_res['tracks']['items']

    # If there are no results:
    if len(tracks) == 0:
        # Create a new query and search again
        new_query = row["Artist(s)"] + row["Song"]
        search2_res = sp.search(q=new_query, limit=5)
        tracks2 = search2_res['tracks']['items']

        # If there are still no results, add to "no_results"
        if len(tracks2) == 0:
            no_results = no_results.append(row, ignore_index=True)
        
        else:
            # For every result, add to results DataFrame
            for idx, track in enumerate(tracks):
                row = add_data_to_row(row, track)
                results = results.append(row, ignore_index=True)
    else:
        # For every result, add to results DataFrame
        for idx, track in enumerate(tracks):
            row = add_data_to_row(row, track)
            results = results.append(row, ignore_index=True)

# Save results, and list of missing tracks to csv files
results.to_csv('../data/search-results/all.csv', index=False)
no_results.to_csv('../data/search-results/all_missing.csv', index=False)


# Load data from Spotify search results
results = pd.read_csv('../data/search-results/all.csv')

# Format release date as datetime object
results.loc[results['Release Date'] == "0000",'Release Date'] = None
results['Release Date'] = pd.to_datetime(results['Release Date'])


# Create new dataframe of the oldest result for each song
selected = pd.DataFrame()

# Select the oldest result of each song
for q, song in results.groupby(['Query','Year']):
    oldest = song.iloc[0]
    for i, result in song.iterrows():
        if (result['Release Date']) < (oldest['Release Date']):
            oldest = result
    selected = selected.append(oldest, ignore_index=True)

# Results
selected.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6070 entries, 0 to 6069
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Album ID      6070 non-null   object        
 1   Artist IDs    6070 non-null   object        
 2   Artist(s)     6070 non-null   object        
 3   ID            6070 non-null   object        
 4   Popularity    6070 non-null   float64       
 5   Query         6070 non-null   object        
 6   Rank          6070 non-null   float64       
 7   Release Date  6070 non-null   datetime64[ns]
 8   Song          6070 non-null   object        
 9   Year          6070 non-null   float64       
dtypes: datetime64[ns](1), float64(3), object(6)
memory usage: 474.3+ KB


# Export results
selected.to_csv('../data/search-results/all_selected.csv', index=False)


# Result statistics
differences = (selected['Release Date'].dt.year - (selected['Year']))
differences.describe()

count    6070.000000
mean        2.126359
std         9.934072
min       -87.000000
25%        -1.000000
50%         0.000000
75%         0.000000
max        59.000000
dtype: float64


# Uses the same imports as spotify ID search

# Load credentials
with open('../config.json') as file:
    app_data = json.load(file)

# Authenticate
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials( \
                     client_id=app_data['client_id'], \
                     client_secret=app_data['client_secret']))


# Load data for selected tracks
all_years = pd.read_csv('../data/search-results/all_selected.csv')

# Remove problematic row
delete_row = all_years[(all_years["Year"]==2005.0) & (all_years["Rank"] == 80)].index
all_years = all_years.drop(delete_row)


# Create dataframe with the added audio features
all_audio_features = pd.DataFrame()

for rank_val, rank in all_years.groupby('Rank'):

    # Get dataframe of audio features for this rank
    audio_features = pd.DataFrame (sp.audio_features(tracks=rank['ID']))

    # Combine new information with existing data
    rank_and_features = rank.merge(audio_features, left_on='ID', right_on='id', how='inner')
    rank_and_features = rank_and_features.drop(columns=['track_href','type','uri','analysis_url','id'])
    all_audio_features = all_audio_features.append(rank_and_features)

# Display characteristics of result
all_audio_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6069 entries, 0 to 61
Data columns (total 23 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Album ID          6069 non-null   object 
 1   Artist IDs        6069 non-null   object 
 2   Artist(s)         6069 non-null   object 
 3   ID                6069 non-null   object 
 4   Popularity        6069 non-null   float64
 5   Query             6069 non-null   object 
 6   Rank              6069 non-null   float64
 7   Release Date      6069 non-null   object 
 8   Song              6069 non-null   object 
 9   Year              6069 non-null   float64
 10  danceability      6069 non-null   float64
 11  energy            6069 non-null   float64
 12  key               6069 non-null   int64  
 13  loudness          6069 non-null   float64
 14  mode              6069 non-null   int64  
 15  speechiness       6069 non-null   float64
 16  acousticness      6069 non-null   float64
 17  instrumentalness  6069 non-null   float64
 18  liveness          6069 non-null   float64
 19  valence           6069 non-null   float64
 20  tempo             6069 non-null   float64
 21  duration_ms       6069 non-null   int64  
 22  time_signature    6069 non-null   int64  
dtypes: float64(12), int64(4), object(7)
memory usage: 1.1+ MB


# Export results
all_audio_features.to_csv('../data/Billboard_Audio_Features.csv', index=False)


# Uses the same imports as spotify ID search

# Load credentials
with open('../config.json') as file:
    app_data = json.load(file)

# Authenticate
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials( \
                     client_id=app_data['client_id'], \
                     client_secret=app_data['client_secret']))


# Load data with audio features
all_years = pd.read_csv('../data/Billboard_Audio_Features.csv')

# Convert strings of artist IDs into lists of artist IDs and select the first artist
all_years['Artist IDs'] = all_years['Artist IDs'].str.lstrip('[').str.rstrip(']')
all_years['Primary Artist ID'] = all_years['Artist IDs'].str.split(',', expand = True)[0].str.strip("'")

# Create column for genres
all_years['Genres'] = pd.Series(dtype=object)

# Create empty dataframe for results
all_with_genres = pd.DataFrame(columns=all_years.columns)


# Iterate through rows and lookup the primary artist's genres
for i, row in all_years.iterrows():
    artist = sp.artist(row['Primary Artist ID'])
    row['Genres'] = ", ".join(artist["genres"])
    all_with_genres = all_with_genres.append(row, ignore_index= True)


# Export data
all_with_genres.to_csv('../data/Billboard_Audio_Features.csv', index=False)


# Read in data and drop songs with no genre found
data = pd.read_csv('../data/Billboard_Audio_Features.csv')
data = data.dropna()
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5424 entries, 0 to 5745
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Album ID           5424 non-null   object 
 1   Artist IDs         5424 non-null   object 
 2   Artist(s)          5424 non-null   object 
 3   ID                 5424 non-null   object 
 4   Popularity         5424 non-null   float64
 5   Query              5424 non-null   object 
 6   Rank               5424 non-null   float64
 7   Release Date       5424 non-null   object 
 8   Song               5424 non-null   object 
 9   Year               5424 non-null   float64
 10  danceability       5424 non-null   float64
 11  energy             5424 non-null   float64
 12  key                5424 non-null   int64  
 13  loudness           5424 non-null   float64
 14  mode               5424 non-null   int64  
 15  speechiness        5424 non-null   float64
 16  acousticness       5424 non-null   float64
 17  instrumentalness   5424 non-null   float64
 18  liveness           5424 non-null   float64
 19  valence            5424 non-null   float64
 20  tempo              5424 non-null   float64
 21  duration_ms        5424 non-null   int64  
 22  time_signature     5424 non-null   int64  
 23  Primary Artist ID  5424 non-null   object 
 24  Genres             5424 non-null   object 
 25  Top Genre          5424 non-null   object 
dtypes: float64(12), int64(4), object(10)
memory usage: 1.1+ MB


# Output the counts of full genre strings
data['Genres'].value_counts()[:25]

dance pop, pop, post-teen pop                                                              104
dance pop, pop                                                                              61
contemporary country, country, country road                                                 52
dance pop, pop, urban contemporary                                                          51
bubblegum pop                                                                               43
barbadian pop, dance pop, pop, pop rap, post-teen pop, urban contemporary                   32
soft rock                                                                                   30
pop, post-teen pop                                                                          28
canadian hip hop, canadian pop, hip hop, pop rap, rap, toronto rap                          28
glam rock, mellow gold, piano rock, soft rock                                               27
dance pop, hip pop, neo soul, new jack swing, r&b, urban contemporary                       26
beatlesque, british invasion, classic rock, merseybeat, psychedelic rock, rock              26
pop, r&b, soul                                                                              25
pop, pop rock                                                                               24
adult standards, funk, indie r&b, motown, quiet storm, soul                                 23
atl hip hop, dance pop, pop, r&b, south carolina hip hop, urban contemporary                23
dance pop, pop, pop rap, post-teen pop                                                      22
contemporary country, country, country road, modern country rock                            21
adult standards, mellow gold, soft rock                                                     21
dance pop, pop, pop rap, r&b, rap                                                           20
boy band, dance pop                                                                         20
dance pop, pop, post-teen pop, r&b                                                          20
disco                                                                                       20
album rock, art rock, beatlesque, classic rock, folk rock, mellow gold, rock, soft rock     19
adult standards, disco, funk, mellow gold, motown, quiet storm, soft rock, soul             19
Name: Genres, dtype: int64


# Empty dataframe with same columns
new_data = pd.DataFrame(columns=data.columns)

# Iterate through rows and place songs into genre categories
for i, row in data.iterrows():
    # Set to NaN if no appropriate genre found
    genre = np.nan

    # Take genres string
    genres = row["Genres"]

    # If contains keyword, place in genre
    # For pop/rock, count occurences and place into genre with most
    if "country" in genres:
        genre = "country"
    elif "disco" in genres:
        genre = "disco"
    elif "edm" in genres or "electro" in genres:
        genre = "edm"
    elif "r&b" in genres or "soul" in genres:
        genre = "soul/r&b"
    elif "hip hop" in genres or "rap" in genres:
        genre = "hip hop/rap"
    elif "alternative" in genres or "indie" in genres:
        genre = "alternative/indie"
    elif "rock" in genres or "pop" in genres:
        rock = 0
        pop = 0
        for g in genres.split(','):
            if "rock" in g:
                rock += 1
            if "pop" in g:
                pop += 1
        if rock >= pop:
            genre = "rock"
        else:
            genre = "pop"
    # Set row to chosen genre and append
    row["Top Genre"] = genre
    new_data = new_data.append(row)
    
# Counts of each genre
new_data['Top Genre'].value_counts()

rock                 1290
pop                  1063
soul/r&b              957
hip hop/rap           771
country               603
disco                 426
edm                   159
alternative/indie     155
Name: Top Genre, dtype: int64


# Export data
new_data.to_csv('../data/Billboard_Audio_Features.csv', index=False)


# Imports
import seaborn as sn


# Load complete dataset
data = pd.read_csv('../data/Billboard_Audio_Features.csv')

# Drop irrelevant columns for 
data = data.drop(columns=['Album ID', 'Artist IDs', 'Query', 'Song', 'Artist(s)'])

# Drop songs with no genre
data = data.dropna()

# Display
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5424 entries, 0 to 5745
Data columns (total 21 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   ID                 5424 non-null   object 
 1   Popularity         5424 non-null   float64
 2   Rank               5424 non-null   float64
 3   Release Date       5424 non-null   object 
 4   Year               5424 non-null   float64
 5   danceability       5424 non-null   float64
 6   energy             5424 non-null   float64
 7   key                5424 non-null   int64  
 8   loudness           5424 non-null   float64
 9   mode               5424 non-null   int64  
 10  speechiness        5424 non-null   float64
 11  acousticness       5424 non-null   float64
 12  instrumentalness   5424 non-null   float64
 13  liveness           5424 non-null   float64
 14  valence            5424 non-null   float64
 15  tempo              5424 non-null   float64
 16  duration_ms        5424 non-null   int64  
 17  time_signature     5424 non-null   int64  
 18  Primary Artist ID  5424 non-null   object 
 19  Genres             5424 non-null   object 
 20  Top Genre          5424 non-null   object 
dtypes: float64(12), int64(4), object(5)
memory usage: 932.2+ KB


# Create a dataframe of the average of each feature per year
avgs = data.groupby("Year").mean().reset_index()

# Graph the relationship of tempo over time
ax = avgs.plot(x="Year", y="tempo")

# Labels
ax.set_ylabel('Tempo (bpm)')
ax.set_title('Tempo Vs. Time')
ax.get_legend().remove()

# Display
display()


# Graph the relationship of loudness over time
ax = avgs.plot(x="Year", y="loudness")

# Labels
ax.set_ylabel('Loudness (dB)')
ax.set_title('Loudness Vs. Time')
ax.get_legend().remove()

# Display
display()


# Convert duration ms to seconds
avgs['duration_ms'] = avgs['duration_ms']/1000

# Graph the relationship of duration over time
ax = avgs.plot(x="Year", y="duration_ms")

# Labels
ax.set_ylabel('Duration (sec)')
ax.set_title('Duration Vs. Time')
ax.get_legend().remove()

# Display
display()


# Rescale average popularity to match other features
avgs['Popularity'] = avgs['Popularity'] / 100

# Array of features with 0.0 to 1.0 scale
features = ['danceability','energy','speechiness','acousticness','liveness','valence', 'instrumentalness']

# Create initial plot of popularity over time
ax = avgs.plot(x="Year", y='Popularity', figsize=(15,10))

# Add plots of other features to the plot
for feature in features:
    avgs.plot(x="Year", y=feature, ax=ax)

# Label and display
ax.set_ylabel('Proportion')
ax.set_title('Features Vs. Time')
ax.legend(loc=(1.005,0.7))
display()


# Create 2D array of correlations and plot
corrs = data.drop(['Rank'], axis=1).corr()
sn.heatmap(corrs)

# Label and display
plt.title('Correlation of Features')
plt.show()


# Create dataframe for genre percent by year
genre_percent = pd.DataFrame()

# Get percentage of year that is each genre type
for y, rows in data.groupby('Year'):
    total = len(rows)
    # Create and populate row to be added
    new_row = {}
    new_row['Year'] = y
    for g, count in rows['Top Genre'].value_counts().iteritems():
        new_row[g] = count/total
    # Add row
    genre_percent = genre_percent.append(new_row, ignore_index=True)

# Replace empty genre percentages with 0%
genre_percent = genre_percent.replace(np.nan, 0)


# Reformat genre_percent dataframe for plot
genre_columns = genre_percent.drop('Year', axis=1).transpose().to_numpy()

# Create plot showing genre division over time
plt.stackplot(genre_percent['Year'],genre_columns)

# Label and display
plt.legend(genre_percent.drop('Year', axis=1).columns,loc=(1.005,0.5))
plt.xlim(1960,2020)
plt.ylim(0,1)
plt.xlabel('Year')
plt.ylabel('Proportion')
plt.title('Genre Proportion Over Time')
display()


# Create a dataframe of the average of each feature per genre
data.groupby('Top Genre').mean()[['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms']]


# Get list of genres
genres = data['Top Genre'].unique()

# Create violin plot of each feature for each genre
for feature in ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature']:
    # Create array of all values of the feature for each genre
    feature_by_genre = [data.loc[data["Top Genre"] == genre][feature].values for genre in genres]

    # Create plot
    plt.figure(figsize=(12, 4))
    plt.violinplot(feature_by_genre, range(0, len(genres)), showmeans=True)

    # Label
    plt.xticks(range(0, len(genres)), labels=genres)
    plt.title(f'{feature[0].upper() + feature[1:]} Distribution Per Genre')
    plt.xlabel("Genre")
    plt.ylabel(feature[0].upper() + feature[1:])


# Read in the Billboard data with genres and drop empty rows
data = pd.read_csv('../data/Billboard_Audio_Features.csv')
data = data.dropna()
data.head()


# Obtain the list of features we might want to use
feature_list = list(data.columns.values)
feature_list = feature_list[10:22]
feature_list

['danceability',
 'energy',
 'key',
 'loudness',
 'mode',
 'speechiness',
 'acousticness',
 'instrumentalness',
 'liveness',
 'valence',
 'tempo',
 'duration_ms']


import scipy
from scipy import stats

# The list of genres
genre_list = ['hip hop/rap','alternative/indie','country','disco','edm',
             'pop','rock','soul/r&b']

# Run Kruskal-Wallis for each feature
for feature in feature_list:
    genre_frames = []
    
    # Create a frame for each genre, and include it in the test as a parameter
    for genre in genre_list:
        genre_frames.append(data[data['Top Genre'] == genre][feature])
        
    t, p = scipy.stats.kruskal(genre_frames[0],genre_frames[1],
                              genre_frames[2],genre_frames[3],
                              genre_frames[4],genre_frames[5],
                              genre_frames[6],genre_frames[7])
    print(feature + ": t: " + str(t) + " p-value: " + str(p))
    print("")

danceability: t: 892.0653791458489 p-value: 2.481648369683846e-188

energy: t: 282.5551765302803 p-value: 3.200577495327485e-57

key: t: 22.058580524622368 p-value: 0.0024820045783447127

loudness: t: 846.8851729523643 p-value: 1.4099281667606597e-178

mode: t: 244.82240105668825 p-value: 3.502312772938618e-49

speechiness: t: 1336.3904842714205 p-value: 2.2325501776921572e-284

acousticness: t: 496.36764795999653 p-value: 4.840288930133197e-103

instrumentalness: t: 289.16510404008136 p-value: 1.244030697270327e-58

liveness: t: 34.83952318519514 p-value: 1.1988445159940325e-05

valence: t: 133.88666962789867 p-value: 9.679373519097457e-26

tempo: t: 72.6032348114743 p-value: 4.3927937644712003e-13

duration_ms: t: 171.90173191784524 p-value: 9.970796947188792e-34


# X is the list of inputs to our model, each input is the list of audio features for each observation
# Y is the list of true outputs, each output is the genre for each observation
X = data[feature_list].values
Y = data['Top Genre'].values


from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import plot_confusion_matrix
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Splits the data into training and testing sets
# We stratify the sets so the sets of a proportionate amount of each genre
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
                                                    stratify=Y, 
                                                    test_size=0.30)

# Create LDA and fit it to the training data
lda_clf = LinearDiscriminantAnalysis()
lda_clf.fit(X_train, Y_train);

# Make predictions with the trained model
Y_pred = lda_clf.predict(X_test)

# Print metrics and confusion matrix
print(metrics.classification_report(Y_test,Y_pred))

plot_confusion_matrix(lda_clf, X_test, Y_test, xticks_rotation='vertical')
plt.title('LDA Confusion Matrix')
plt.show()

E:\anaconda3\lib\site-packages\sklearn\metrics\_classification.py:1221: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
                   precision    recall  f1-score   support

alternative/indie       0.09      0.02      0.03        47
          country       0.00      0.00      0.00       181
            disco       0.48      0.17      0.25       128
              edm       0.00      0.00      0.00        48
      hip hop/rap       0.60      0.57      0.58       231
              pop       0.27      0.33      0.30       319
             rock       0.40      0.72      0.52       387
         soul/r&b       0.35      0.34      0.35       287

         accuracy                           0.39      1628
        macro avg       0.27      0.27      0.25      1628
     weighted avg       0.34      0.39      0.35      1628


# Drop the genres we are no longer looking at
data = data[data['Top Genre'] != "country"]
data = data[data['Top Genre'] != "edm"]
data = data[data['Top Genre'] != "disco"]
data = data[data['Top Genre'] != "alternative/indie"]

# Obtain inputs and outputs
X = data[feature_list].values
Y = data['Top Genre'].values

# Split into training and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
                                                    stratify=Y, 
                                                    test_size=0.30)
# Create and fit LDA
lda_clf = LinearDiscriminantAnalysis()
lda_clf.fit(X_train, Y_train);

# Predict on test set
Y_pred = lda_clf.predict(X_test)

# Print metrics and confusion matrix
print(metrics.classification_report(Y_test,Y_pred))

plot_confusion_matrix(lda_clf, X_test, Y_test)
plt.title('LDA Confusion Matrix')
plt.show()

              precision    recall  f1-score   support

 hip hop/rap       0.71      0.63      0.67       232
         pop       0.40      0.34      0.37       319
        rock       0.56      0.71      0.62       387
    soul/r&b       0.43      0.38      0.41       287

    accuracy                           0.52      1225
   macro avg       0.53      0.52      0.52      1225
weighted avg       0.52      0.52      0.52      1225


from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

# Remove the features we are no longer using
feature_list = list(data.columns.values)
feature_list = feature_list[10:22]
feature_list.remove("liveness")
feature_list.remove("valence")
feature_list.remove("acousticness")
feature_list.remove("tempo")
feature_list.remove("key")

# List of genres we are looking at
genre_list = ['hip hop/rap',
             'pop','rock','soul/r&b']

# List of inputs and outputs
X = data[feature_list].values
Y = data['Top Genre'].values

# Create and fit a decision tree classifier
dt_clf = DecisionTreeClassifier(max_depth=3)
model = dt_clf.fit(X, Y)

# Plot the visualization of the decision tree
fig = plt.figure(figsize=(25,20))
_ = tree.plot_tree(dt_clf, 
                   feature_names=feature_list,
                   class_names=genre_list,
                   filled=True)
plt.title('Genre Classification Decision Tree',fontdict={'fontsize':20})
plt.show()


# Split into training and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
                                                    stratify=Y, 
                                                    test_size=0.30)

# Create and fit classifier
dt_clf = DecisionTreeClassifier(max_depth=3)
model = dt_clf.fit(X_train, Y_train)

# Predict using fitted model
Y_pred = dt_clf.predict(X_test)

# Print metrics and confusion matrix
print(metrics.classification_report(Y_test,Y_pred))

plot_confusion_matrix(dt_clf, X_test, Y_test)
plt.title('Decision Tree Confusion Matrix')
plt.show()

              precision    recall  f1-score   support

 hip hop/rap       0.64      0.56      0.60       232
         pop       0.36      0.23      0.28       319
        rock       0.50      0.61      0.55       387
    soul/r&b       0.34      0.40      0.37       287

    accuracy                           0.45      1225
   macro avg       0.46      0.45      0.45      1225
weighted avg       0.45      0.45      0.45      1225


from sklearn.ensemble import RandomForestClassifier

# Split into training and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
                                                    stratify=Y, 
                                                    test_size=0.30)

# Create and fit random forest classifier
rfc_clf = RandomForestClassifier(max_depth=3)
model = rfc_clf.fit(X_train, Y_train)

# Predict using trained model
Y_pred = rfc_clf.predict(X_test)

# Print metrics and confusion matrix
print(metrics.classification_report(Y_test,Y_pred))

plot_confusion_matrix(rfc_clf, X_test, Y_test)
plt.title('Random Forest Confusion Matrix')
plt.show()

              precision    recall  f1-score   support

 hip hop/rap       0.60      0.62      0.61       232
         pop       0.39      0.30      0.34       319
        rock       0.51      0.83      0.63       387
    soul/r&b       0.51      0.18      0.27       287

    accuracy                           0.50      1225
   macro avg       0.50      0.48      0.46      1225
weighted avg       0.50      0.50      0.47      1225


from sklearn.model_selection import cross_val_score

# Create LDA and run 10-fold cross validation
lda_clf = LinearDiscriminantAnalysis()
lda_scores = cross_val_score(lda_clf, X, Y, cv=10, scoring='f1_macro')

# Create random forest and run 10-fold cross validation
rfc_clf = RandomForestClassifier(max_depth=3)
rfc_scores = cross_val_score(rfc_clf, X, Y, cv=10, scoring='f1_macro')

# Create decision tree and run 10-fold cross validation
dt_clf = DecisionTreeClassifier(max_depth=3)
dt_scores = cross_val_score(dt_clf, X, Y, cv=10, scoring='f1_macro')


# Print average scores and standard error
print(f"Average LDA f1 score:{lda_scores.mean()}\tStandard Error:{lda_scores.std()}\n")
print(f"Average RFC f1 score:{rfc_scores.mean()} \tStandard Error:{rfc_scores.std()}\n")
print(f"Average DT f1 score:{dt_scores.mean()}  \tStandard Error:{dt_scores.std()}\n")

Average LDA f1 score:0.48604832593261077	Standard Error:0.03529134309428848

Average RFC f1 score:0.4481556873587225 	Standard Error:0.017199063324212588

Average DT f1 score:0.3885433942651619  	Standard Error:0.04012772693405444


# Run Kruskal-Wallis on the three score lists
t, p = scipy.stats.kruskal(lda_scores,rfc_scores,dt_scores)
print(f"t: {t}\tp-value: {p}")

t: 17.77806451612902	p-value: 0.00013789303830809166

	Rank	Song	Artist(s)	Year
0	1	" Theme from A Summer Place "	Percy Faith	1960
1	2	" He'll Have to Go "	Jim Reeves	1960
2	3	" Cathy's Clown "	The Everly Brothers	1960
3	4	" Running Bear "	Johnny Preston	1960
4	5	" Teen Angel "	Mark Dinning	1960

	Rank	Song	Artist(s)	Year	Query
0	1	Theme from A Summer Place	Percy Faith	1960	track:Theme from A Summer Place artist:Percy F...
1	2	He ll Have to Go	Jim Reeves	1960	track:He ll Have to Go artist:Jim Reeves
2	3	Cathy s Clown	The Everly Brothers	1960	track:Cathy s Clown artist:The Everly Brothers
3	4	Running Bear	Johnny Preston	1960	track:Running Bear artist:Johnny Preston
4	5	Teen Angel	Mark Dinning	1960	track:Teen Angel artist:Mark Dinning

	danceability	energy	key	loudness	mode	speechiness	acousticness	instrumentalness	liveness	valence	tempo	duration_ms
Top Genre
alternative/indie	0.542510	0.741785	4.806452	-6.095174	0.774194	0.043202	0.104419	0.022460	0.173363	0.528571	124.011626	243029.658065
country	0.576854	0.596515	5.316750	-9.108643	0.854063	0.043375	0.300146	0.012771	0.176951	0.609237	121.208128	225771.673300
disco	0.650005	0.586485	5.401408	-10.640622	0.629108	0.049814	0.293952	0.042251	0.186396	0.690267	118.331310	270524.835681
edm	0.671069	0.698082	5.364780	-5.749308	0.591195	0.100986	0.142479	0.027229	0.160148	0.526826	119.645428	220808.044025
hip hop/rap	0.734201	0.679448	5.654994	-6.289724	0.573281	0.171719	0.116275	0.013333	0.201890	0.558777	118.736595	239301.067445
pop	0.624875	0.646237	5.233302	-7.744183	0.688617	0.057938	0.261869	0.024659	0.175172	0.602963	118.965252	227133.331138
rock	0.560775	0.612283	5.083721	-9.424492	0.788372	0.045340	0.295345	0.028425	0.184704	0.607356	122.029451	231397.483721
soul/r&b	0.664520	0.573771	5.560084	-8.615846	0.594566	0.079703	0.273966	0.022854	0.172080	0.618500	114.954679	244294.026123

	Album ID	Artist IDs	Artist(s)	ID	Popularity	Query	Rank	Release Date	Song	Year	...	acousticness	instrumentalness	liveness	valence	tempo	duration_ms	time_signature	Primary Artist ID	Genres	Top Genre
0	3bbdZdZQtCtnWEN5zG7Eyj	'3Z02hBLubJxuFJfhacLSDc'	Bryan Adams	5HQ6AUDMbMuwktvcNPuV1y	36.0	track:(Everything I Do) I Do It for You artist...	1.0	1991-01-01	(Everything I Do) I Do It for You	1991.0	...	0.06480	0.000013	0.0550	0.273	131.189	398027	4	3Z02hBLubJxuFJfhacLSDc	album rock, canadian pop, canadian singer-song...	rock
1	4zhigAhPwqp43XVHBiVeQI	'7xTcuBOIAAIGDOSvwYFPzk'	Daniel Powter	0mUyMawtxj1CJ76kn9gIZK	75.0	track:Bad Day artist:Daniel Powter	1.0	2005-02-22	Bad Day	2006.0	...	0.44800	0.003360	0.1510	0.520	140.046	233640	4	7xTcuBOIAAIGDOSvwYFPzk	canadian pop, neo mellow, pop rock	pop
2	0jZfbz0dNfDjPSg0hYJNth	'72OaDtakiy6yFqkt4TsiFt'	Cher	2goLsvvODILDzeeiT4dAoR	76.0	track:Believe artist:Cher	1.0	1998-01-01	Believe	1999.0	...	0.00820	0.000000	0.0509	0.459	132.975	239027	4	72OaDtakiy6yFqkt4TsiFt	dance pop, hollywood, new wave pop, pop, pop rock	pop
3	2Jmuuw5ff8gAGAP6B1yKKJ	'5PN2aHIvLEM98XIorsPMhE'	Kim Carnes	0kPeymTUiaidv48eRrMISu	64.0	track:Bette Davis Eyes artist:Kim Carnes	1.0	2007-01-01	Bette Davis Eyes	1981.0	...	0.01740	0.000000	0.1980	0.560	116.685	224693	4	5PN2aHIvLEM98XIorsPMhE	new wave pop, soft rock	rock
4	4yP0hdKOZPNshxUOjY0cZj	'1Xyo4u8uXC1ZmMpatF05PJ'	The Weeknd	0VjIjW4GlUZAMYd2vXMi3b	94.0	track:Blinding Lights artist:The Weeknd	1.0	2020-03-20	Blinding Lights	2020.0	...	0.00146	0.000095	0.0897	0.334	171.005	200040	4	1Xyo4u8uXC1ZmMpatF05PJ	canadian contemporary r&b, canadian pop, pop	soul/r&b

Trends and Classification of Popular Music from 1960-2020¶

Outline¶

Introduction¶

Part 1: Gathering Data¶

Part 1.1: Billboard Top 100 Web Scraping¶

Part 1.2: Spotify Track ID Collection¶

Part 1.3: ID Selection¶

Part 1.4: Get Audio Features¶

Part 1.5: Get Genre¶

Part 2: Exploratory Data Analysis¶

Part 2.1: Exploration of Features¶

Part 2.2: Exploration of Genre¶

Part 3: Hypothesis Testing & Machine Learning¶

Part 4: Conclusion¶