Skip to content

Commit

Permalink
include dj profile page URL in djKey
Browse files Browse the repository at this point in the history
  • Loading branch information
apsteinmetz committed Jan 19, 2025
1 parent 5af6cd1 commit 72b053a
Show file tree
Hide file tree
Showing 5 changed files with 84 additions and 41 deletions.
79 changes: 54 additions & 25 deletions 1 - scrape_playlists.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ library(rvest)
library(stringr)
library(xml2)
library(tidyverse)

library(progress)



Expand All @@ -14,6 +14,7 @@ ROOT_URL<-"http://wfmu.org"

#-------------------------------------------
getDJURLs <- function(){

rawDJURLs<- read_html(paste(ROOT_URL,"/playlists",sep=""))
# get the urls of the each DJs RSS playlist feed
t<-rawDJURLs%>%html_nodes(xpath='//html//body//center[2]//table[1]//table//a[contains(.,"Playlists")]') %>%
Expand All @@ -40,31 +41,40 @@ getDJsOffSched <- function(){
#---------------------------------------------------
# get the shownames for a DJ
getShowNames<-function(DJURLs) {
DJKey <- data.frame()
pb <- progress_bar$new(
format = " Getting Show :what [:bar] :percent eta: :eta",
clear = FALSE, total = length(DJURLs))
djKey <- data.frame()
for (page in DJURLs) {
singleDJ<- read_html(page)
showName <- html_node(singleDJ,"title")%>%html_text()
showName <- gsub("\n","",sub("Playlists and Archives for ","",showName))
showName<-str_replace(showName,'WFMU:',"")
showName<-str_replace_all(showName,':Playlists and Archives',"")
DJ <- sub("http://wfmu.org/playlists/","",page)
DJKey<-rbind(DJKey,data.frame(DJ=DJ,ShowName=showName))
print(showName)
profileURL<-singleDJ%>%
html_nodes(xpath="//a[contains(@href,'profile')]") %>%
html_attr("href")
# if profile URL is not found, use the DJ URL
if (length(profileURL)==0) profileURL<-page
# print(DJ)
djKey<-rbind(djKey,data.frame(DJ=DJ,ShowName=showName,profileURL=profileURL))
pb$tick(tokens = list(what = DJ))
}
# now identify those DJs which are currently ON MIC
DJKey$onSched <- 'YES'
DJKey$onSched[which(DJKey$DJ %in% getDJsOffSched())]<-'NO'
djKey$onSched <- 'YES'
djKey$onSched[which(djKey$DJ %in% getDJsOffSched())]<-'NO'
#strip "WFMU" and "Playlists and Archives" and some punctuation
DJKey$ShowName<-str_replace_all(DJKey$ShowName,"(P|p)laylists (and|&) (A|a)rchives","")
DJKey$ShowName<-str_replace_all(DJKey$ShowName,"-","")
DJKey$ShowName<-str_replace_all(DJKey$ShowName,"(P|p)laylist|(R|r)ecent","")
DJKey$ShowName<-str_replace_all(DJKey$ShowName,"WFMU|wfmu","")
DJKey$ShowName<-str_replace_all(DJKey$ShowName,"The ","")
DJKey$ShowName<-str_trim(DJKey$ShowName)
djKey$ShowName<-str_replace_all(djKey$ShowName,"(P|p)laylists (and|&) (A|a)rchives","")
djKey$ShowName<-str_replace_all(djKey$ShowName,"-","")
djKey$ShowName<-str_replace_all(djKey$ShowName,"(P|p)laylist|(R|r)ecent","")
djKey$ShowName<-str_replace_all(djKey$ShowName,"WFMU|wfmu","")
djKey$ShowName<-str_replace_all(djKey$ShowName,"The ","")
djKey$ShowName<-str_trim(djKey$ShowName)


return (DJKey)
#save(DJKey,file = "data/DJKey.rdata")
return (djKey)
#save(djKey,file = "data/djKey.rdata")
}

# -------------get the URLs of the playlist pages for a DJ ----------
Expand Down Expand Up @@ -97,7 +107,7 @@ return(pl_url)
getDJPlaylistURLs<-function(music_djs) {
DJ_playlists = NULL
dudList<-NULL
#DJKey = data.frame()
#djKey = data.frame()
for (dj in music_djs) {
print(dj)
url_suffixes<-get_playlist_page_URLs(dj)
Expand Down Expand Up @@ -129,6 +139,25 @@ getDJPlaylistURLs<-function(music_djs) {
return(DJ_playlists)
}

# Collect the profile-page links found on each DJ's playlist page.
#
# Each URL in `DJURLs` is fetched, the DJ code is obtained by stripping the
# "http://wfmu.org/playlists/" prefix from the URL, and the href of every
# anchor whose href contains the word "profile" is recorded.
#
# Args:
#   DJURLs: DJ playlist page URLs, processed one page at a time.
# Returns:
#   A tibble with columns `DJ` and `profileURL`, one row per matching link
#   (a page with several matching links yields several rows), or NULL when
#   no page contains such a link.
getDJProfileURLs <- function(DJURLs) {
  pb <- progress_bar$new(total = length(DJURLs))
  # One slot per page, filled only when the page has a profile link. The
  # pieces are bound once at the end rather than re-copying a growing tibble
  # on every iteration.
  found <- vector("list", length(DJURLs))
  slot <- 0L
  for (page in DJURLs) {
    slot <- slot + 1L
    singleDJ <- read_html(page)
    DJ <- sub("http://wfmu.org/playlists/", "", page)
    profileURL <- singleDJ %>%
      html_nodes(xpath = "//a[contains(@href,'profile')]") %>%
      html_attr("href")
    if (length(profileURL) > 0) {
      found[[slot]] <- tibble(DJ = DJ, profileURL = profileURL)
    }
    pb$tick()
  }
  found <- Filter(Negate(is.null), found)
  # Preserve the original contract: NULL (not an empty tibble) when no
  # profile link was found on any page.
  if (length(found) == 0) {
    return(NULL)
  }
  bind_rows(found)
}

#-------------------------------------------------
# Get all Artists ever played by a DJ
#WFMU maintains this as a separate page
Expand All @@ -141,7 +170,7 @@ getDJArtistNames<-function(DJURLs) {
showName <- html_node(singleDJ,"title")%>%html_text()
showName <- gsub("\n","",sub("Playlists and Archives for ","",showName))
DJ <- sub("http://wfmu.org/playlists/","",page)
DJKey<-rbind(DJKey,data.frame(DJ=DJ,ShowName=showName))
djKey<-rbind(djKey,data.frame(DJ=DJ,ShowName=showName))
print(showName)
artistListPage <- paste(ROOT_URL,URL_BRANCH,DJ, sep="")
artistList<-read_html(artistListPage)%>%html_node(xpath="//body/div")%>%html_text()%>%str_split("\n")
Expand Down Expand Up @@ -379,9 +408,9 @@ get_playlist <- function(plURL="/playlists/shows/93065", dj = "WA") {
}
#-------------- MAIN -----------------
DJURLs<-getDJURLs()
DJKey<-getShowNames(DJURLs)
save(DJKey,file = "data/DJKey.rdata")
#load(file='data/djkey.rdata')
djKey<-getShowNames(DJURLs)
save(djKey,file = "data/djKey.rdata")
#load(file='data/djKey.rdata')

excludeDJs <-
c('SD',
Expand Down Expand Up @@ -409,7 +438,7 @@ excludeDJs <-
'TP',
'RC',
'VC')
music_djs<-DJKey %>%
music_djs<-djKey %>%
select(DJ) %>%
anti_join(tibble(DJ=excludeDJs)) %>%
pull(DJ)
Expand All @@ -418,14 +447,14 @@ showCounts<-playlistURLs %>%
group_by(DJ) %>%
summarise(showCount=n()) %>%
arrange(desc(showCount))
DJKey<-left_join(DJKey,showCounts) %>% drop_na()
save(DJKey,file = "data/DJKey.rdata")
djKey<-left_join(djKey,showCounts) %>% drop_na()
save(djKey,file = "data/djKey.rdata")

#limit analysis to DJs with at least numShows shows.
# This also excludes DJs where we couldn't extract valid playlist URLs.
numShows <- 10
# non-music shows
djList <- DJKey %>%
djList <- djKey %>%
filter(showCount > numShows, !(DJ %in% excludeDJs)) %>%
pull(DJ)

Expand Down Expand Up @@ -479,12 +508,12 @@ for (dj in djList_temp) {
save(playlists_raw,file = "data/playlists_raw.rdata")
}

bad_Tables<-anti_join(tibble(DJ=djList),playlists_raw) %>% left_join(DJKey)
bad_Tables<-anti_join(tibble(DJ=djList),playlists_raw) %>% left_join(djKey)

playlists_raw<-playlists_raw %>%
filter(Artist != Title) %>% #single column span across table. Not a song.
distinct()

save(playlists_raw,file = "data/playlists_raw.rdata")
right_join(DJKey,bad_Tables) %>% save(file = "data/bad_tables.rdata")
right_join(djKey,bad_Tables) %>% save(file = "data/bad_tables.rdata")

22 changes: 11 additions & 11 deletions 2 - clean_playlists.r
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ set_collapse(mask = NULL)
#clean up raw playlists

load("data/playlists_raw.rdata")
load("data/djkey.rdata")
load("data/djKey.rdata")

#Clean up inconsistent artist names

Expand Down Expand Up @@ -272,11 +272,11 @@ show_count<-playlists %>%
group_by(DJ) |>
summarise(showCount=n())

DJKey<-DJKey %>%
djKey<-djKey %>%
select(-showCount) %>%
left_join(show_count) %>%
distinct()
# save(DJKey,file = "data/DJKey.rdata")
# save(djKey,file = "data/djKey.rdata")


#use artisttoken to select the most common version of the artist name and make that the token.
Expand Down Expand Up @@ -308,7 +308,7 @@ playlists <- playlists %>%
# save(playlists,file = "data/playlists.rdata")
# write_csv(playlists,path="playlists.csv")

# add first show and last show to djkey
# add first show and last show to djKey
FirstShow<-playlists %>%
group_by(DJ) %>%
select(DJ,AirDate) %>%
Expand All @@ -321,14 +321,14 @@ LastShow<-playlists %>%
distinct() %>%
top_n(1) %>% rename(LastShow=AirDate)

DJKey <- DJKey %>%
djKey <- djKey %>%
select(DJ,ShowName,onSched,showCount) %>%
left_join(FirstShow,by=c("DJ")) %>%
left_join(LastShow,by=c("DJ"))

DJKey <- select(playlists,DJ) |>
djKey <- select(playlists,DJ) |>
distinct() |>
left_join(DJKey)
left_join(djKey)

# save unique artisttokens as rdata
cat("Saving unique artist tokens as rdata\n")
Expand All @@ -340,10 +340,10 @@ all_artisttokens <- playlists |>
# save as rdata
save(all_artisttokens, file = "data/all_artisttokens.rdata")

cat("Saving DJKey.parquet\n")
# save(DJKey, file = "data/DJKey.RData")
# save DJKey as parquet
duckplyr::df_to_parquet(DJKey, "data/DJKey.parquet")
cat("Saving djKey.parquet\n")
# save(djKey, file = "data/djKey.RData")
# save djKey as parquet
duckplyr::df_to_parquet(djKey, "data/djKey.parquet")


# save(playlists,file = "data/playlists.rdata")
Expand Down
24 changes: 19 additions & 5 deletions 4 - save files parquet.R
Original file line number Diff line number Diff line change
@@ -1,19 +1,20 @@
library(arrow)
library(purrr)
library(glue)

# copy precalculated data over to shiny app directory

#load(file = "data/DJKey.RData")
#load(file = "data/djKey.RData")
# load(file = "data/playlists.Rdata")
# load(file = "data/djSimilarity.RData")
# load(file = "data/distinctive_artists.RData")
load(file = "data/djdtm.RData")
load(file = "all_artisttokens.rdata")
# load(file = "all_artisttokens.rdata")


library(fs)

tables = c("DJKey",
tables = c("djKey",
"playlists",
"dj_similarity_tidy",
"distinctive_artists")
Expand All @@ -25,13 +26,26 @@ file_ext = ".parquet"
# sink=paste0("data/",file_stem,file_ext))
#}

fs::file_copy("data/djKey.RData",
"../wfmu_explorer/data/djKey.RData",
overwrite = TRUE)



# Copy data/<table>.parquet into the Shiny app's data directory
# (../wfmu_explorer/data), overwriting any copy already there.
copy_parquet_to_shiny <- function(table) {
  source_path <- glue("data/{table}.parquet")
  target_path <- glue("../wfmu_explorer/data/{table}.parquet")
  fs::file_copy(source_path, target_path, overwrite = TRUE)
}

# Write the object named by `file_stem` into the Shiny app's data directory
# as "../wfmu_explorer/data/<file_stem><file_ext>".
#
# Args:
#   file_stem: a single string naming an existing object; it is also used as
#     the base name of the output file.
# Relies on `file_ext`, which is not defined in this function and must exist
# at script level.
save_parquet_to_shiny <- function(file_stem) {
  stopifnot(is.character(file_stem), length(file_stem) == 1L)
  # Look the object up by name instead of eval(parse(text = ...)), so the
  # string is never executed as code. The search starts in this function's
  # environment and continues through its enclosures, as before.
  table_data <- get(file_stem, envir = environment())
  arrow::write_parquet(
    table_data,
    sink = paste0("../wfmu_explorer/data/", file_stem, file_ext)
  )
}

# tables |> walk(save_parquet_to_local)
tables |> walk(save_parquet_to_shiny)
tables |> walk(copy_parquet_to_shiny)
#tables |> walk(save_parquet_to_shiny)
save(djdtm,file="../wfmu_explorer/data/djdtm.rdata")
save(all_artisttokens,file="../wfmu_explorer/data/all_artisttokens.rdata")
# save(all_artisttokens,file="../wfmu_explorer/data/all_artisttokens.rdata")

Binary file modified data/DJKey.parquet
Binary file not shown.
Binary file added data/djKey.rdata
Binary file not shown.

0 comments on commit 72b053a

Please sign in to comment.