# skill_extraction_metrics.py
import logging
import statistics
from abc import ABCMeta, abstractmethod
from collections import defaultdict
from typing import List

import numpy

from skills_ml.job_postings.common_schema import get_onet_occupation
from skills_ml.ontologies.base import CompetencyOntology
from skills_ml.algorithms.skill_extractors.base import CandidateSkillYielder


class SkillExtractorMetric(metaclass=ABCMeta):
    """Base class for skill extraction metrics.

    Subclasses implement `eval`, which consumes an iterable of candidate
    skills extracted from a sample of documents and returns a metric value.
    """
    @abstractmethod
    def eval(self, candidate_skills: CandidateSkillYielder, sample_len: int) -> float:
        pass
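

# A minimal usage sketch of the metric interface. `my_extractor` and `sample`
# are hypothetical stand-ins for a skill extractor and a document sample, not
# part of this module:
#
#     metric = TotalVocabularySize()
#     value = metric.eval(my_extractor.candidate_skills(sample), sample_len=len(sample))
#
# Note that `eval` consumes the yielder; if it is a generator, create a fresh
# one for each metric rather than reusing an exhausted iterator.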


class OntologyCompetencyRecall(SkillExtractorMetric):
    """The percentage of competencies in an ontology that are present in the candidate skills"""
    @property
    def name(self):
        return f'{self.ontology.competency_framework.name}_competency_recall'

    def __init__(self, ontology: CompetencyOntology):
        self.ontology = ontology
        self.lookup = set(competency.identifier for competency in ontology.competencies)

    def eval(self, candidate_skills: CandidateSkillYielder, sample_len: int) -> float:
        num_total_terms = len(self.lookup)
        if num_total_terms == 0:
            logging.warning('Lookup has zero terms, cannot evaluate. Returning 0')
            return 0
        found_terms = set()
        for candidate_skill in candidate_skills:
            if candidate_skill.matched_skill_identifier in self.lookup:
                found_terms.add(candidate_skill.matched_skill_identifier)
        num_found_terms = len(found_terms)
        logging.info('Found %s terms out of %s total', num_found_terms, num_total_terms)
        return float(num_found_terms) / num_total_terms
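

# Illustrative only: computing competency recall against an ontology. `onet`
# and `extractor` are hypothetical names; any CompetencyOntology and any
# candidate-skill yielder would do:
#
#     metric = OntologyCompetencyRecall(onet)
#     recall = metric.eval(extractor.candidate_skills(sample), sample_len=len(sample))
#     # recall == fraction of onet's competencies matched at least once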


class OntologyOccupationRecall(SkillExtractorMetric):
    """The percentage of occupations in the ontology that are present in the candidate skills"""
    @property
    def name(self):
        return f'{self.ontology.name}_occupation_recall'

    def __init__(self, ontology: CompetencyOntology):
        self.ontology = ontology
        self.lookup = set(occupation.identifier.lower() for occupation in ontology.occupations)

    def eval(self, candidate_skills: CandidateSkillYielder, sample_len: int) -> float:
        num_total_occupations = len(self.lookup)
        if num_total_occupations == 0:
            logging.warning('Lookup has zero occupations, cannot evaluate. Returning 0')
            return 0
        found_occupations = set()
        for candidate_skill in candidate_skills:
            occupation = get_onet_occupation(candidate_skill.source_object)
            # Only count occupations that actually appear in the ontology lookup,
            # so the recall value stays bounded by 1.0
            if occupation and occupation.lower() in self.lookup:
                found_occupations.add(occupation.lower())
        num_found_occupations = len(found_occupations)
        logging.info('Found %s occupations out of %s total', num_found_occupations, num_total_occupations)
        return float(num_found_occupations) / num_total_occupations


class MedianSkillsPerDocument(SkillExtractorMetric):
    """The median number of distinct skills present in each document"""
    name = 'median_skills_per_document'

    def eval(self, candidate_skills: CandidateSkillYielder, sample_len: int) -> float:
        skills_in_document = defaultdict(set)
        for candidate_skill in candidate_skills:
            skills_in_document[candidate_skill.document_id].add(candidate_skill.skill_name)
        counts = [len(skill_set) for skill_set in skills_in_document.values()]
        # Sampled documents that yielded no candidate skills count as zero
        counts.extend([0] * (sample_len - len(skills_in_document)))
        return statistics.median(counts)
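

# Worked example of the zero padding above (hypothetical numbers): with
# sample_len=5 and three documents yielding 2, 3, and 7 distinct skills, the
# counts become [2, 3, 7, 0, 0], so skill-free documents pull the median down
# rather than being ignored:
#
#     >>> statistics.median([2, 3, 7, 0, 0])
#     2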


class SkillsPerDocumentHistogram(SkillExtractorMetric):
    """A histogram of the number of distinct skills present in each document"""
    def __init__(self, bins=10, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.bins = bins

    @property
    def name(self):
        return f'skills_per_document_histogram_{self.bins}bins'

    def eval(self, candidate_skills: CandidateSkillYielder, sample_len: int) -> List:
        skills_in_document = defaultdict(set)
        for candidate_skill in candidate_skills:
            skills_in_document[candidate_skill.document_id].add(candidate_skill.skill_name)
        counts = [len(skill_set) for skill_set in skills_in_document.values()]
        # Sampled documents that yielded no candidate skills count as zero
        counts.extend([0] * (sample_len - len(skills_in_document)))
        # numpy.histogram returns (counts, bin_edges); only the counts are kept
        return list(numpy.histogram(counts, bins=self.bins)[0])
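

# For example (hypothetical numbers), with bins=2:
#
#     >>> numpy.histogram([2, 3, 7, 0, 0], bins=2)
#     (array([4, 1]), array([0. , 3.5, 7. ]))
#
# so the metric value would be [4, 1]: four documents fall in the lower half
# of the range and one in the upper half. The bin edges are discarded.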


class PercentageNoSkillDocuments(SkillExtractorMetric):
    """The percentage of documents that contained zero skills"""
    name = 'pct_no_skill_documents'

    def eval(self, candidate_skills: CandidateSkillYielder, sample_len: int) -> float:
        documents_with_skills = set(
            candidate_skill.document_id for candidate_skill in candidate_skills
        )
        return (sample_len - len(documents_with_skills)) / sample_len


class TotalVocabularySize(SkillExtractorMetric):
    """The total number of distinct skill names represented"""
    name = 'total_vocabulary_size'

    def eval(self, candidate_skills: CandidateSkillYielder, sample_len: int) -> int:
        return len(set(candidate_skill.skill_name for candidate_skill in candidate_skills))


class TotalOccurrences(SkillExtractorMetric):
    """The total number of candidate skill occurrences"""
    name = 'total_candidate_skills'

    def eval(self, candidate_skills: CandidateSkillYielder, sample_len: int) -> int:
        return sum(1 for candidate_skill in candidate_skills)


# Keys used to match candidate skills across evaluation sets: the strict key
# requires the skill to be found at the same character offset, while the
# nonstrict key matches on document and skill name alone.
strict_candidate_key = lambda cs: (cs.document_id, cs.document_type, cs.skill_name, cs.start_index)
nonstrict_candidate_key = lambda cs: (cs.document_id, cs.document_type, cs.skill_name)
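# For instance (hypothetical values), a candidate skill found in posting
# 'doc1' would key as ('doc1', 'posting', 'python', 42) under the strict key
# and as ('doc1', 'posting', 'python') under the nonstrict one, so a second
# occurrence of 'python' at offset 90 collapses into the same nonstrict key.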


class EvaluationSetPrecision(SkillExtractorMetric):
    """Find the precision evaluated against an evaluation set of candidate skills.

    Args:
        candidate_skills (CandidateSkillYielder): A collection of candidate skills to evaluate against
        evaluation_set_name (str): A name for the evaluation set of candidate skills.
            Used in the name of the metric so results from multiple evaluation sets
            can be compared side-by-side.
        strict (bool, default True): Whether or not to enforce the exact location of the match,
            versus just matching between sets on the same skill name and document.
            Setting this to False will guard against:

            1. labelers who don't mark every instance of a skill once they have found one
            2. discrepancies in start_index values caused by errant transformation methods

            However, this could also produce false matches, so use with care.
    """
    @property
    def name(self):
        strictstring = 'strict' if self.strict else 'nonstrict'
        return f'{self.evaluation_set_name}_evaluation_set_precision_{strictstring}'

    def __init__(
        self,
        candidate_skills: CandidateSkillYielder,
        evaluation_set_name: str,
        strict: bool=True
    ):
        self.strict = strict
        self.keyfunc = strict_candidate_key if strict else nonstrict_candidate_key
        self.gold_standard_candidate_skills = set(self.keyfunc(candidate_skill) for candidate_skill in candidate_skills)
        self.evaluation_set_name = evaluation_set_name

    def eval(self, candidate_skills: CandidateSkillYielder, sample_len: int) -> float:
        num_candidates_in_gs = 0
        total_candidates = 0
        for candidate_skill in candidate_skills:
            total_candidates += 1
            if self.keyfunc(candidate_skill) in self.gold_standard_candidate_skills:
                num_candidates_in_gs += 1
        # Guard against division by zero when the extractor produced nothing
        if total_candidates == 0:
            logging.warning('No candidate skills produced, cannot evaluate precision. Returning 0')
            return 0
        return float(num_candidates_in_gs) / total_candidates
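

# Usage sketch (hypothetical names): `labeled_skills` is a gold-standard
# yielder, e.g. loaded from a hand-labeled sample, and `extractor` is the
# skill extractor under evaluation:
#
#     metric = EvaluationSetPrecision(labeled_skills, 'hand_labeled_v1', strict=False)
#     precision = metric.eval(extractor.candidate_skills(sample), sample_len=len(sample))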


class EvaluationSetRecall(SkillExtractorMetric):
    """Find the recall evaluated against an evaluation set of candidate skills.

    Args:
        candidate_skills (CandidateSkillYielder): A collection of candidate skills to evaluate against
        evaluation_set_name (str): A name for the evaluation set of candidate skills.
            Used in the name of the metric so results from multiple evaluation sets
            can be compared side-by-side.
        strict (bool, default True): Whether or not to enforce the exact location of the match,
            versus just matching between sets on the same skill name and document.
            Setting this to False will guard against:

            1. labelers who don't mark every instance of a skill once they have found one
            2. discrepancies in start_index values caused by errant transformation methods

            However, this could also produce false matches, so use with care.
    """
    @property
    def name(self):
        strictstring = 'strict' if self.strict else 'nonstrict'
        return f'{self.evaluation_set_name}_evaluation_set_recall_{strictstring}'

    def __init__(
        self,
        candidate_skills: CandidateSkillYielder,
        evaluation_set_name: str,
        strict: bool=True
    ):
        self.strict = strict
        self.keyfunc = strict_candidate_key if strict else nonstrict_candidate_key
        self.gold_standard_candidate_skills = set(self.keyfunc(candidate_skill) for candidate_skill in candidate_skills)
        self.evaluation_set_name = evaluation_set_name

    def eval(self, candidate_skills: CandidateSkillYielder, sample_len: int) -> float:
        # Guard against division by zero when the evaluation set is empty
        if not self.gold_standard_candidate_skills:
            logging.warning('Evaluation set is empty, cannot evaluate recall. Returning 0')
            return 0
        found_candidate_skills = set(self.keyfunc(candidate_skill) for candidate_skill in candidate_skills)
        num_candidates_found = len(self.gold_standard_candidate_skills & found_candidate_skills)
        return float(num_candidates_found) / len(self.gold_standard_candidate_skills)
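

# Precision and recall share the same evaluation-set construction, so a
# typical run (hypothetical names, as above) computes both against one
# labeled sample:
#
#     for metric_class in (EvaluationSetPrecision, EvaluationSetRecall):
#         metric = metric_class(labeled_skills, 'hand_labeled_v1')
#         print(metric.name, metric.eval(extractor.candidate_skills(sample), len(sample)))
#
# Note: if `labeled_skills` is a generator it will be exhausted by the first
# constructor; materialize it as a list first.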