# utils.jl
using Mmap, CodecZlib, CSV, DataFrames
"""
    get_vectors_duplicates(vector_dataframe, word_subset)

Build a semantic matrix with one row per entry of `word_subset` that has a
semantic vector in `vector_dataframe`. Words may occur several times in
`word_subset`; each occurrence gets its own row.

# Arguments
- `vector_dataframe`: table whose first column holds the words and whose
  remaining columns hold the vector components (presumably a `DataFrame` loaded
  from a `.csv.gz` file by the caller; loading is not done here).
- `word_subset`: vector of (not necessarily unique) words for which a semantic
  vector is requested.

# Returns
- `S`: `Float64` matrix of size `(number of retained words, vector dimension)`.
  Row `i` is the vector of the `i`-th retained word.
- `rownames`: the entries of `word_subset` that were found in the first column
  of `vector_dataframe`, in their original order (duplicates kept). Words
  without a semantic vector are dropped.

If a word is listed more than once in `vector_dataframe`, the vector of its
first occurrence is used.
"""
function get_vectors_duplicates(vector_dataframe, word_subset)
    semvecs_col = vector_dataframe[:, 1]
    # Everything after the first column is numeric vector data.
    semvecs = Matrix(vector_dataframe[:, 2:end])

    # Map every word to the row of its FIRST occurrence so that each lookup is
    # O(1) instead of scanning the whole word column once per requested word.
    first_row = Dict{eltype(semvecs_col),Int}()
    for (row, word) in pairs(semvecs_col)
        get!(first_row, word, row)
    end

    # Keep only the requested words that actually have a vector (order and
    # duplicates preserved).
    rownames = filter(w -> haskey(first_row, w), word_subset)

    # Diagnostics go through the logging system; unlike a fixed 5x5 preview
    # this cannot fail on tables with fewer than five rows or columns.
    @debug "Selected semantic vectors" requested = length(word_subset) retained =
        length(rownames) dimension = size(semvecs, 2)

    S = zeros(Float64, length(rownames), size(semvecs, 2))
    for (i, w) in enumerate(rownames)
        S[i, :] .= view(semvecs, first_row[w], :)
    end
    return S, rownames
end