-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.R
97 lines (58 loc) · 2.8 KB
/
main.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
library("LDAvis")
library("aRxiv")
library("tm")
library("topicmodels")
# Working directory; the data and output paths in this script are built
# relative to it.
dir <- getwd()
# arXiv query for astro-ph submissions dated 2016-04-01 to 2016-04-30. It is
# only consumed by the commented-out download step below.
query <- 'cat:astro-ph* AND submittedDate:[20160401 TO 20160430]'
# One-off download and caching of the search results. Kept commented out so
# the script works from the cached file instead of re-querying arXiv.
#arxiv_count(query)
#z <- arxiv_search(query,force=T,limit=20000)
#save(z,file = paste0(dir,'/sussex/arxiv_topicmodels/2016_arxiv.RData'))
# Restore the cached search results. The save() call above stores the object
# under the name `z`, so that is the name expected back from load().
arxiv.file <- file.path(dir, "sussex", "arxiv_topicmodels", "2016_arxiv.RData")
restored <- load(arxiv.file)
if (!"z" %in% restored) {
  stop("Expected object 'z' in ", arxiv.file, call. = FALSE)
}
# filter for papers with the primary category in astro-ph
all.cats <- unique(z$primary_category)
astro_cats <- all.cats[grepl("^astro-ph", all.cats)]
dat <- z[z$primary_category %in% astro_cats, ]
# Build a corpus with one document per abstract.
corpus <- tm::Corpus(tm::VectorSource(dat$abstract))
# Normalise the text: lower-case, remove punctuation and digits, remove
# English stop words, collapse the whitespace left behind, then stem.
# No `lazy` argument is passed to tm_map(): on corpus classes whose tm_map()
# method forwards extra arguments to the transformation, it would be handed to
# tolower() and friends and fail as an unused argument.
english.stopwords <- tm::stopwords('english')
corpus.clean <- tm::tm_map(corpus, tm::content_transformer(tolower))
corpus.clean <- tm::tm_map(corpus.clean,
                           tm::content_transformer(tm::removePunctuation))
corpus.clean <- tm::tm_map(corpus.clean,
                           tm::content_transformer(tm::removeNumbers))
# A single stop-word pass is enough: removing whole words cannot create new
# stop-word matches, so a second pass would not change the text.
corpus.clean <- tm::tm_map(corpus.clean,
                           tm::content_transformer(tm::removeWords),
                           english.stopwords)
corpus.clean <- tm::tm_map(corpus.clean,
                           tm::content_transformer(tm::stripWhitespace))
corpus.clean <- tm::tm_map(corpus.clean, tm::stemDocument)
# Document-term matrix: one row per abstract, one column per stemmed term.
dtm <- tm::DocumentTermMatrix(corpus.clean)
# filter out low scoring tf-idf terms: keep only the terms whose summed
# tf-idf weight lies above the 30th percentile
tfidf.scores <- colSums(as.matrix(tm::weightTfIdf(dtm)))
dtm <- dtm[, tfidf.scores > quantile(tfidf.scores, 0.3)]
# convert to matrix to allow row and column sums to be calculated
td.mat <- as.matrix(dtm)
# Dropping terms can leave an abstract with no terms at all, and the LDA fit
# rejects all-zero rows. Remove such documents from the matrices and from
# `dat` together so that row i of each still refers to the same paper.
has.terms <- rowSums(td.mat) > 0
if (!all(has.terms)) {
  dtm <- dtm[has.terms, ]
  td.mat <- td.mat[has.terms, , drop = FALSE]
  dat <- dat[has.terms, ]
}
# Number of topics to fit.
topic.no <- 15
# Gibbs sampling is stochastic. Fixing the sampler seed through `control`
# makes the fitted topics, and the files saved from them, reproducible.
lda.seed <- 2016L
lda <- topicmodels::LDA(dtm, k = topic.no, method = "Gibbs",
                        control = list(seed = lda.seed))
# Compute the posterior once and read both components from it:
#   phi   - per-topic term distributions   ($terms)
#   theta - per-document topic distributions ($topics)
lda.posterior <- posterior(lda)
phi <- lda.posterior$terms
theta <- lda.posterior$topics
# Summary statistics of the filtered document-term matrix required by LDAvis.
doc.length <- rowSums(td.mat)
term.frequency <- colSums(td.mat)
vocab <- tm::Terms(dtm)
library(RJSONIO)
# Serialise the fitted model and corpus summaries into the JSON that LDAvis
# consumes.
LDAvis.json <- LDAvis::createJSON(
  phi = phi,
  theta = theta,
  doc.length = doc.length,
  vocab = vocab,
  term.frequency = term.frequency
)
# Read the `topic.order` field back out of the JSON; it is used further down
# to reorder the columns of `theta`.
topic_order <- RJSONIO::fromJSON(LDAvis.json)[["topic.order"]]
# Serve the visualisation.
LDAvis::serVis(LDAvis.json)
## save data for Shiny app
output.dir <- file.path(dir, "sussex", "arxiv_topicmodels")
# ldavis json
save(LDAvis.json, file = file.path(output.dir, "ldavis.RData"))
# small data frame of interesting features: paper metadata next to the
# per-document topic weights, with topic columns arranged by `topic_order`
theta_sorted <- data.frame(theta[, topic_order, drop = FALSE])
# Label the columns from the actual number of topics rather than a fixed
# 1:15, so the labels stay valid if the topic count changes.
colnames(theta_sorted) <- paste("topic", seq_len(ncol(theta_sorted)))
paper.fields <- c("id", "submitted", "updated", "title", "link_abstract",
                  "primary_category")
output_data <- data.frame(dat[, paper.fields], theta_sorted)
save(output_data, file = file.path(output.dir, "topics.RData"))
## what categories does a topic represent?
# find the topic distribution for each category: the mean topic weights over
# all papers whose primary category matches
top_topics <- lapply(astro_cats, function(category) {
  in.category <- dat$primary_category %in% category
  # drop = FALSE keeps a one-paper category as a 1-row matrix; without it the
  # subset collapses to a vector and the column-wise mean fails.
  colMeans(theta[in.category, , drop = FALSE])
})
# find the top topic for each category
# NOTE: the indices refer to the columns of `theta` as returned by the model,
# not to the `topic_order`-rearranged columns stored in `output_data`.
top.topic <- vapply(top_topics, function(mean.weights) {
  best <- which.max(mean.weights)
  # which.max() returns integer(0) when there is nothing to rank (e.g. a
  # category left with no papers); report NA instead of failing.
  if (length(best) == 0L) NA_integer_ else as.integer(best)
}, integer(1))
data.frame(category = astro_cats, top_topic = top.topic)