-
Notifications
You must be signed in to change notification settings - Fork 550
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Prep DataModel for removal #1102
base: main
Are you sure you want to change the base?
Changes from 10 commits
22c54bb
9a34c00
a6504d5
4871eae
8dc4b80
442a65a
4eb1aa1
35b53cb
2fdbe6e
2d9a6a3
5482c21
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,6 +8,7 @@ | |
import numpy | ||
|
||
import dedupe.variables | ||
from dedupe.variables.base import CustomType | ||
from dedupe.variables.base import FieldType as FieldVariable | ||
from dedupe.variables.base import MissingDataType, Variable | ||
from dedupe.variables.interaction import InteractionType | ||
|
@@ -28,7 +29,7 @@ | |
) | ||
from dedupe.predicates import Predicate | ||
|
||
VARIABLE_CLASSES = {k: v for k, v in FieldVariable.all_subclasses() if k} | ||
VARIABLE_CLASSES = {k: v for k, v in Variable.all_subclasses() if k} | ||
|
||
|
||
class DataModel(object): | ||
|
@@ -38,21 +39,23 @@ def __init__(self, variable_definitions: Iterable[VariableDefinition]): | |
variable_definitions = list(variable_definitions) | ||
if not variable_definitions: | ||
raise ValueError("The variable definitions cannot be empty") | ||
all_variables: list[Variable] | ||
self.primary_variables, all_variables = typify_variables(variable_definitions) | ||
self._derived_start = len(all_variables) | ||
|
||
all_variables += interactions(variable_definitions, self.primary_variables) | ||
variables = typify_variables(variable_definitions) | ||
non_interactions: list[FieldVariable] = [ | ||
v for v in variables if not isinstance(v, InteractionType) # type: ignore[misc] | ||
] | ||
self.primary_variables = non_interactions | ||
expanded_primary = _expand_higher_variables(self.primary_variables) | ||
self._derived_start = len(expanded_primary) | ||
|
||
all_variables = expanded_primary.copy() | ||
all_variables += _expanded_interactions(variables) | ||
all_variables += missing(all_variables) | ||
|
||
self._missing_field_indices = missing_field_indices(all_variables) | ||
self._interaction_indices = interaction_indices(all_variables) | ||
|
||
self._len = len(all_variables) | ||
|
||
def __len__(self) -> int: | ||
return self._len | ||
|
||
# Changing this from a property to just a normal attribute causes | ||
# pickling problems, because we are removing static methods from | ||
# their class context. This could be fixed by defining comparators | ||
|
@@ -82,7 +85,7 @@ def distances( | |
) -> numpy.typing.NDArray[numpy.float_]: | ||
num_records = len(record_pairs) | ||
|
||
distances = numpy.empty((num_records, len(self)), "f4") | ||
distances = numpy.empty((num_records, self._len), "f4") | ||
|
||
for i, (record_1, record_2) in enumerate(record_pairs): | ||
|
||
|
@@ -142,13 +145,8 @@ def __setstate__(self, d): | |
self.__dict__ = d | ||
|
||
|
||
def typify_variables( | ||
variable_definitions: Iterable[VariableDefinition], | ||
) -> tuple[list[FieldVariable], list[Variable]]: | ||
primary_variables: list[FieldVariable] = [] | ||
all_variables: list[Variable] = [] | ||
only_custom = True | ||
|
||
def typify_variables(variable_definitions: list[VariableDefinition]) -> list[Variable]: | ||
variables: list[Variable] = [] | ||
for definition in variable_definitions: | ||
try: | ||
variable_type = definition["type"] | ||
|
@@ -167,12 +165,6 @@ def typify_variables( | |
"{'field' : 'Phone', type: 'String'}" | ||
) | ||
|
||
if variable_type != "Custom": | ||
only_custom = False | ||
|
||
if variable_type == "Interaction": | ||
continue | ||
|
||
if variable_type == "FuzzyCategorical" and "other fields" not in definition: | ||
definition["other fields"] = [ # type: ignore | ||
d["field"] | ||
|
@@ -183,30 +175,37 @@ def typify_variables( | |
try: | ||
variable_class = VARIABLE_CLASSES[variable_type] | ||
except KeyError: | ||
valid = ", ".join(VARIABLE_CLASSES) | ||
raise KeyError( | ||
"Field type %s not valid. Valid types include %s" | ||
% (definition["type"], ", ".join(VARIABLE_CLASSES)) | ||
f"Variable type {variable_type} not valid. Valid types include {valid}" | ||
) | ||
|
||
variable_object = variable_class(definition) | ||
assert isinstance(variable_object, FieldVariable) | ||
assert isinstance(variable_object, Variable) | ||
|
||
primary_variables.append(variable_object) | ||
variables.append(variable_object) | ||
|
||
if hasattr(variable_object, "higher_vars"): | ||
all_variables.extend(variable_object.higher_vars) | ||
else: | ||
variable_object = cast(Variable, variable_object) | ||
all_variables.append(variable_object) | ||
|
||
if only_custom: | ||
no_blocking_variables = all( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Continuing from #1098 (comment) I think it would be better if this was more duck-typed though, such as That would be a slight change though, since I think really there needs to be a more official API where variables declare the predicates they export. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. the code that add the ExistsPredicate should only really be on FieldPredicates as it depends on there being a field. https://github.com/dedupeio/dedupe/blob/main/dedupe/variables/base.py#L37 if we move that code, then we could do pretty much what you are saying. no_blocking_variables = not any(len(v.predicates) for v in variables) we might also want to have a check like only_exists = all(isinstance(pred, ExistsPredicate) for v in variables for pred in v.predicates)
if only_exists:
warning.warn('You are only using the Exists Predicate which is usually pretty bad') |
||
isinstance(v, (CustomType, InteractionType)) for v in variables | ||
) | ||
if no_blocking_variables: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Perhaps this check should happen in DataModel.init, and not in this utility function? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. it seems of a piece with the "Incorrect variable specification: variable" error? |
||
raise ValueError( | ||
"At least one of the variable types needs to be a type" | ||
"other than 'Custom'. 'Custom' types have no associated" | ||
"blocking rules" | ||
"At least one of the variable types needs to be a type " | ||
"other than 'Custom' or 'Interaction', " | ||
"since these types have no associated blocking rules." | ||
) | ||
|
||
return primary_variables, all_variables | ||
return variables | ||
|
||
|
||
def _expand_higher_variables(variables: Iterable[Variable]) -> list[Variable]: | ||
result: list[Variable] = [] | ||
for variable in variables: | ||
if hasattr(variable, "higher_vars"): | ||
result.extend(variable.higher_vars) | ||
else: | ||
result.append(variable) | ||
return result | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. what if we added an iter method to Variable so that we could just do something like.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this is a good idea in general, but I would want to name it to differentiate between predicates and features, since Variables supply both. Perhaps |
||
|
||
|
||
def missing(variables: list[Variable]) -> list[MissingDataType]: | ||
|
@@ -217,16 +216,12 @@ def missing(variables: list[Variable]) -> list[MissingDataType]: | |
return missing_variables | ||
|
||
|
||
def interactions( | ||
definitions: Iterable[VariableDefinition], primary_variables: list[FieldVariable] | ||
) -> list[InteractionType]: | ||
field_d = {field.name: field for field in primary_variables} | ||
|
||
def _expanded_interactions(variables: list[Variable]) -> list[InteractionType]: | ||
field_vars = {var.name: var for var in variables if isinstance(var, FieldVariable)} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Again, this should feels like it needs a more official API, or should be responsibility of the InteractionType There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i can think of a few designs here, but this code is actually doing two things for us, both of which are kind of important.
seems like we kind of need to do both. |
||
interactions = [] | ||
for definition in definitions: | ||
if definition["type"] == "Interaction": | ||
var = InteractionType(definition) | ||
var.expandInteractions(field_d) | ||
for var in variables: | ||
if isinstance(var, InteractionType): | ||
var.expandInteractions(field_vars) | ||
interactions.extend(var.higher_vars) | ||
return interactions | ||
|
||
|
@@ -236,15 +231,27 @@ def missing_field_indices(variables: list[Variable]) -> list[int]: | |
|
||
|
||
def interaction_indices(variables: list[Variable]) -> list[list[int]]: | ||
var_names = [var.name for var in variables] | ||
_ensure_unique_names(variables) | ||
name_to_index = {var.name: i for i, var in enumerate(variables)} | ||
indices = [] | ||
for var in variables: | ||
if hasattr(var, "interaction_fields"): | ||
interaction_indices = [var_names.index(f) for f in var.interaction_fields] # type: ignore | ||
interaction_indices = [name_to_index[f] for f in var.interaction_fields] # type: ignore | ||
indices.append(interaction_indices) | ||
return indices | ||
|
||
|
||
def _ensure_unique_names(variables: Iterable[Variable]) -> None: | ||
seen = set() | ||
for var in variables: | ||
if var.name in seen: | ||
raise ValueError( | ||
"Variable name used more than once! " | ||
"Choose a unique name for each variable: '{var.name}'" | ||
) | ||
seen.add(var.name) | ||
|
||
|
||
def reduce_method(m): # type: ignore[no-untyped-def] | ||
return (getattr, (m.__self__, m.__func__.__name__)) | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -58,21 +58,16 @@ def all_subclasses( | |
|
||
|
||
class DerivedType(Variable): | ||
type = "Derived" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why did you remove this? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I might be wrong but my understanding is: This would only be used if someone passes in a variable definition like |
||
|
||
def __init__(self, definition: VariableDefinition): | ||
self.name = "(%s: %s)" % (str(definition["name"]), str(definition["type"])) | ||
super(DerivedType, self).__init__(definition) | ||
|
||
|
||
class MissingDataType(Variable): | ||
type = "MissingData" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why did you remove this? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same reason as for DerivedType |
||
has_missing = False | ||
|
||
def __init__(self, name: str): | ||
|
||
self.name = "(%s: Not Missing)" % name | ||
|
||
self.has_missing = False | ||
self.name = f"({name}: Not Missing)" | ||
|
||
|
||
class FieldType(Variable): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is another case where I think a Variable should have a more official API for if they export if they are an Interaction or not. Maybe should think of a better name for this too. Atomic vs derived? leaf vs nonleaf? base vs derived?