-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_feature_update.py
269 lines (224 loc) · 11.1 KB
/
get_feature_update.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
#import libraries
import requests
import pandas as pd
import numpy as np
import sys
sys.path.append('/Users/minhnguyen/IronHack2023-2024/Bootcamp/')
from config_2 import *
import spotipy
import json
from spotipy.oauth2 import SpotifyClientCredentials
from time import sleep
# Initialize the module-level Spotipy client using the client-credentials flow.
# NOTE(review): Client_ID and Client_Secret are not defined in this file; they
# presumably come from the `from config_2 import *` star import above - confirm.
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=Client_ID, client_secret=Client_Secret))
# BLOCK A: FUNCTIONS TO SEARCH SPOTIFY FOR A SONG BY TITLE AND (OPTIONALLY) ARTIST
# Example of the underlying search call:
# results = sp.search(q="track:Great Gatsby artist:Rod Wave", limit=1)
def search_song(title:str, artist:str = None, limit:int = 1) -> "str | None":
    """
    Search Spotify for a song and return the Spotify ID of the first match.

    Parameters:
    - title (str): The title of the song to search for.
    - artist (str, optional): The artist of the song. When it is a non-empty string the
      search is refined to match both title and artist. None, '' and non-string values
      (for example NaN read from a DataFrame cell) all mean "search by title only".
    - limit (int, optional): The maximum number of search results to request. Default is 1.

    Returns:
    - str: The Spotify ID of the first track in the search results, or None when the
      search fails or returns no tracks.

    Note:
    - The function uses the module-level Spotipy client `sp` to perform the search.
    - Failures are reported with a printed message and signalled by a None return value
      instead of an exception, so callers should check the result for None.
    """
    query = f"track:{title}"
    # The default for `artist` is None, so testing only for '' would send the literal
    # text "artist:None" to Spotify; only a real, non-blank string refines the query.
    if isinstance(artist, str) and artist.strip():
        query = f"{query} artist:{artist}"

    try:
        result = sp.search(q=query, limit=limit)
        return result['tracks']['items'][0]['id']
    except Exception:
        # Covers API/network errors as well as an empty result list (IndexError).
        print('Unexpected error occurs')
        return None
def song_info_spotify(title:str, artist:str ='', limit:int = 1):
    """
    Search Spotify for a song and return basic information about the matches.

    Parameters:
    - title (str): The title of the song to search for.
    - artist (str, optional): The artist of the song. When it is a non-empty string the
      search is refined to match both title and artist; otherwise only the title is used.
    - limit (int, optional): The maximum number of search results to request. Default is 1.

    Returns:
    - pandas.DataFrame: One row per track returned by Spotify, with the columns
      'song_id', 'track_name', 'artist_name' (first listed artist) and 'popularity'.
      When nothing could be retrieved, the frame holds a single placeholder row whose
      values are all the string 'None'.

    Note:
    - The function uses the module-level Spotipy client `sp` to perform the search.
    - Spotify may return fewer tracks than `limit`; only the tracks actually returned
      are included.
    """
    columns = ['song_id', 'track_name', 'artist_name', 'popularity']

    query = f"track:{title}"
    if isinstance(artist, str) and artist.strip():
        query = f"{query} artist:{artist}"

    rows = []
    try:
        result = sp.search(q=query, limit=limit)
        # Iterate over what was actually returned rather than range(limit), which
        # indexes past the end when there are fewer hits than requested.
        for item in result['tracks']['items']:
            rows.append({
                'song_id': item['id'],
                'track_name': item['name'],
                'artist_name': item['artists'][0]['name'],
                'popularity': item['popularity'],
            })
    except Exception as error:
        # Keep any rows collected before the failure; report the cause.
        print(f'Spotify search failed: {error}')

    if not rows:
        print('Song not found')
        rows.append({column: 'None' for column in columns})

    track_info = pd.DataFrame(rows, columns=columns)
    return track_info
# BLOCK B: FUNCTION TO SPLIT A LIST OR A DATAFRAME INTO SUBSETS OF ~50 ITEMS
# split list of song ids:
def chunks (song_ids, n:int =50)-> list:
    """
    Divides a sequence of song IDs into chunks of a specified size.

    Parameters:
    - song_ids (list, tuple, pandas.DataFrame, pandas.Series or other sliceable sequence):
      The collection to be divided into chunks.
    - n (int, optional): The desired size of each chunk. Must be at least 1. Default is 50.

    Returns:
    - list: A list of chunks, each holding at most 'n' elements and having the same type
      as slicing 'song_ids' produces.

    Raises:
    - ValueError: If 'n' is smaller than 1.

    Note:
    - pandas objects are split positionally by row with `.iloc`.
    - Any other input is split with plain slicing, so it must support `len()` and slices.
    - If 'song_ids' has 'n' elements or fewer (including none), a single chunk containing
      the original object is returned.
    """
    if n < 1:
        raise ValueError("n must be a positive integer")

    total = len(song_ids)
    if total <= n:
        return [song_ids]

    starts = range(0, total, n)
    # isinstance (not type(x) == ...) so subclasses are handled too; positional .iloc
    # keeps the split independent of whatever index labels the pandas object carries.
    if isinstance(song_ids, (pd.DataFrame, pd.Series)):
        return [song_ids.iloc[start:start + n] for start in starts]
    return [song_ids[start:start + n] for start in starts]
# BLOCK C: FUNCTION TO GET THE LIST OF SONG IDS
# getting list of spotify song ids
def get_list_song_ids(df,col_1:str="Song_title", col_2:str="Artist", pause:float = 30):
    """
    Collects Spotify song IDs for a DataFrame containing song titles and artist information.

    Parameters:
    - df (pandas.DataFrame): The DataFrame containing song information, including titles and artists.
    - col_1 (str, optional): The column name for song titles. Default is "Song_title".
    - col_2 (str, optional): The column name for artist information. Default is "Artist".
    - pause (float, optional): Seconds to sleep between two chunks. Default is 30.

    Returns:
    - tuple: A tuple containing two elements:
        - list: The Spotify song IDs that were found (entries for songs that were not
          found are left out).
        - pandas.DataFrame: A copy of 'df' with an added 'song_id' column, without the
          rows for which no song ID was found.

    Note:
    - The function uses the 'search_song' function to retrieve Spotify song IDs and the
      'chunks' function to split the DataFrame into chunks.
    - It sleeps between chunks (not after the last one) to avoid exceeding rate limits.
    - Only a missing 'song_id' causes a row to be dropped; rows with missing values in
      other columns are kept, so the returned list and DataFrame describe the same songs.
    - The input DataFrame is not modified.
    """
    df2 = df.copy()
    hot_songs_dfs = chunks(df2)
    last_chunk = len(hot_songs_dfs) - 1

    song_ids = []
    for index, hot_songs_df in enumerate(hot_songs_dfs):
        print(f'Collecting spotify song_id for chunk {index}')
        for _, row in hot_songs_df.iterrows():
            try:
                # search for spotify id
                song_id = search_song(row[col_1], row[col_2])
            except Exception:
                print("Song not found")
                song_id = None
            # Exactly one entry per row keeps song_ids aligned with df2's rows.
            song_ids.append(song_id)
        if index < last_chunk:
            print("sleep a bit before getting the next chunk")
            sleep(pause)

    df2['song_id'] = song_ids
    song_ids_final = [value for value in song_ids if value is not None]
    # Restrict dropna to 'song_id': a bare dropna() would also discard rows that merely
    # have a missing value in some unrelated column.
    clean_hot_song = df2.dropna(subset=['song_id'])
    return song_ids_final, clean_hot_song
### BLOCK D: AUDIO FEATURES: split the song ids into chunks and fetch their audio features in one function
# getting audio features for a list of song ids (requested in chunks of up to 100 ids):
def _retry_after_seconds(error):
    """Return the wait in seconds requested by a 429 (rate limit) error, or None for any other error."""
    # requests-style errors carry the status and headers on `.response`.
    response = getattr(error, 'response', None)
    status = getattr(response, 'status_code', None)
    headers = getattr(response, 'headers', None)
    if status is None:
        # NOTE(review): spotipy's SpotifyException is expected to expose `http_status`
        # and `headers` directly - confirm against the installed spotipy version.
        status = getattr(error, 'http_status', None)
        headers = getattr(error, 'headers', None)
    if status != 429:
        return None
    try:
        return int((headers or {}).get('Retry-After', 1))
    except (TypeError, ValueError):
        return 1


def _fetch_audio_features(id_chunk, max_retries):
    """Request audio features for one chunk of IDs, retrying on rate limits; return [] on failure."""
    for attempt in range(max_retries + 1):
        try:
            return sp.audio_features(id_chunk) or []
        except Exception as e:
            retry_after = _retry_after_seconds(e)
            if retry_after is None:
                if isinstance(e, requests.exceptions.HTTPError):
                    # Non-rate-limit HTTP errors are propagated to the caller.
                    raise
                print(f"Failed to get audio features for some track IDs: {e}")
                return []
            if attempt == max_retries:
                print(f"Failed to get audio features for some track IDs: {e}")
                return []
            print(f"Rate limited. Retrying after {retry_after} seconds.")
            sleep(retry_after + 1)
    return []


def get_audio_features (list:list, pause:float = 30, max_retries:int = 3):
    """
    Retrieves audio features from Spotify for a list of song IDs.

    Parameters:
    - list (list): Spotify song IDs for which audio features will be retrieved.
    - pause (float, optional): Seconds to sleep between two chunks. Default is 30.
    - max_retries (int, optional): How many times a rate-limited chunk is requested
      again before it is given up. Default is 3.

    Returns:
    - pandas.DataFrame: One row per song for which Spotify returned audio features, with
      the columns 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
      'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'type',
      'id', 'uri', 'track_href', 'analysis_url', 'duration_ms', 'time_signature'.

    Raises:
    - requests.exceptions.HTTPError: For HTTP errors other than 429 (rate limit).

    Note:
    - The IDs are requested in chunks of up to 100 using the 'chunks' function and the
      module-level Spotipy client `sp`.
    - A rate-limited chunk is retried after the delay given in the Retry-After header.
    - Other failures are printed and the affected chunk is skipped.
    - Songs without audio features (None entries in the response) are skipped; a feature
      key missing from a response entry is stored as None.
    - An empty input returns an empty DataFrame without contacting Spotify.
    """
    feature_keys = ('danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
                    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
                    'type', 'id', 'uri', 'track_href', 'analysis_url', 'duration_ms',
                    'time_signature')
    audio_features_dict = {key: [] for key in feature_keys}

    # The parameter shadows the built-in `list`, so a comprehension is used to copy the
    # IDs into a plain list (this also accepts tuples, Series and other iterables).
    track_ids = [track_id for track_id in list]
    if not track_ids:
        return pd.DataFrame(audio_features_dict)

    sublists = chunks(track_ids, 100)
    last_chunk = len(sublists) - 1
    for index, id_chunk in enumerate(sublists):
        print(f"Retrieving audio_features from chunk {index}")
        for feature in _fetch_audio_features(id_chunk, max_retries):
            if feature is None:
                continue
            # .get keeps every column the same length even if a key is absent.
            for key in feature_keys:
                audio_features_dict[key].append(feature.get(key))
        if index < last_chunk:
            print("sleep a bit before getting the next chunk")
            sleep(pause)

    audio_features_df = pd.DataFrame(audio_features_dict)
    return audio_features_df
### BLOCK E: FUNCTION TO COMBINE DATAFRAME HOT SONG WITH DATAFRAME OF AUDIO FEATURES
# function to add audio features to song_name, artist df:
def add_audio_features (df1, df2, left_col, right_col, how = 'inner' ):
    """
    Adds audio features from one DataFrame to another based on specified columns.

    Parameters:
    - df1 (pandas.DataFrame): The left DataFrame to which audio features will be added.
    - df2 (pandas.DataFrame): The right DataFrame containing audio features to be added.
    - left_col (str): The column in df1 used for merging.
    - right_col (str): The column in df2 used for merging.
    - how (str, optional): The merge type passed through to pandas ('inner', 'left',
      'right', 'outer', ...). Default is 'inner'.

    Returns:
    - pandas.DataFrame: A new DataFrame resulting from the merge of df1 and df2 based on
      the specified columns.

    Note:
    - The function uses pandas' merge function to combine the two DataFrames.
    - 'left_col' and 'right_col' are used as the merging keys.
    - Neither input DataFrame is modified.
    """
    extended_df = pd.merge(df1, df2, left_on=left_col, right_on=right_col, how=how)
    return extended_df