Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Prep DataModel for removal #1102

Open
wants to merge 11 commits into
base: main
Choose a base branch
from
105 changes: 56 additions & 49 deletions dedupe/datamodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import numpy

import dedupe.variables
from dedupe.variables.base import CustomType
from dedupe.variables.base import FieldType as FieldVariable
from dedupe.variables.base import MissingDataType, Variable
from dedupe.variables.interaction import InteractionType
Expand All @@ -28,7 +29,7 @@
)
from dedupe.predicates import Predicate

VARIABLE_CLASSES = {k: v for k, v in FieldVariable.all_subclasses() if k}
VARIABLE_CLASSES = {k: v for k, v in Variable.all_subclasses() if k}


class DataModel(object):
Expand All @@ -38,21 +39,23 @@ def __init__(self, variable_definitions: Iterable[VariableDefinition]):
variable_definitions = list(variable_definitions)
if not variable_definitions:
raise ValueError("The variable definitions cannot be empty")
all_variables: list[Variable]
self.primary_variables, all_variables = typify_variables(variable_definitions)
self._derived_start = len(all_variables)

all_variables += interactions(variable_definitions, self.primary_variables)
variables = typify_variables(variable_definitions)
non_interactions: list[FieldVariable] = [
v for v in variables if not isinstance(v, InteractionType) # type: ignore[misc]
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is another case where I think a Variable should have a more official API for if they export if they are an Interaction or not. Maybe should think of a better name for this too. Atomic vs derived? leaf vs nonleaf? base vs derived?

]
self.primary_variables = non_interactions
expanded_primary = _expand_higher_variables(self.primary_variables)
self._derived_start = len(expanded_primary)

all_variables = expanded_primary.copy()
all_variables += _expanded_interactions(variables)
all_variables += missing(all_variables)

self._missing_field_indices = missing_field_indices(all_variables)
self._interaction_indices = interaction_indices(all_variables)

self._len = len(all_variables)

def __len__(self) -> int:
return self._len

# Changing this from a property to just a normal attribute causes
# pickling problems, because we are removing static methods from
# their class context. This could be fixed by defining comparators
Expand Down Expand Up @@ -82,7 +85,7 @@ def distances(
) -> numpy.typing.NDArray[numpy.float_]:
num_records = len(record_pairs)

distances = numpy.empty((num_records, len(self)), "f4")
distances = numpy.empty((num_records, self._len), "f4")

for i, (record_1, record_2) in enumerate(record_pairs):

Expand Down Expand Up @@ -142,13 +145,8 @@ def __setstate__(self, d):
self.__dict__ = d


def typify_variables(
variable_definitions: Iterable[VariableDefinition],
) -> tuple[list[FieldVariable], list[Variable]]:
primary_variables: list[FieldVariable] = []
all_variables: list[Variable] = []
only_custom = True

def typify_variables(variable_definitions: list[VariableDefinition]) -> list[Variable]:
variables: list[Variable] = []
for definition in variable_definitions:
try:
variable_type = definition["type"]
Expand All @@ -167,12 +165,6 @@ def typify_variables(
"{'field' : 'Phone', type: 'String'}"
)

if variable_type != "Custom":
only_custom = False

if variable_type == "Interaction":
continue

if variable_type == "FuzzyCategorical" and "other fields" not in definition:
definition["other fields"] = [ # type: ignore
d["field"]
Expand All @@ -183,30 +175,37 @@ def typify_variables(
try:
variable_class = VARIABLE_CLASSES[variable_type]
except KeyError:
valid = ", ".join(VARIABLE_CLASSES)
raise KeyError(
"Field type %s not valid. Valid types include %s"
% (definition["type"], ", ".join(VARIABLE_CLASSES))
f"Variable type {variable_type} not valid. Valid types include {valid}"
)

variable_object = variable_class(definition)
assert isinstance(variable_object, FieldVariable)
assert isinstance(variable_object, Variable)

primary_variables.append(variable_object)
variables.append(variable_object)

if hasattr(variable_object, "higher_vars"):
all_variables.extend(variable_object.higher_vars)
else:
variable_object = cast(Variable, variable_object)
all_variables.append(variable_object)

if only_custom:
no_blocking_variables = all(
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Continuing from #1098 (comment)

I think it would be better if this was more duck-typed though, such as
no_blocking_variables = not any(len(v.predicates) for v in variables).

That would be a slight change though, since Variable.predicates is filled with an ExistsPredicate for any var def with the has missing flag. Perhaps this is OK? But I don't think so.

I think really there needs to be a more official API where variables declare the predicates they export.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the code that add the ExistsPredicate should only really be on FieldPredicates as it depends on there being a field.

https://github.com/dedupeio/dedupe/blob/main/dedupe/variables/base.py#L37

if we move that code, then we could do pretty much what you are saying.

no_blocking_variables = not any(len(v.predicates) for v in variables)

we might also want to have a check like

only_exists = all(isinstance(pred, ExistsPredicate) for v in variables for pred in v.predicates)
if only_exists:
    warning.warn('You are only using the Exists Predicate which is usually pretty bad')

isinstance(v, (CustomType, InteractionType)) for v in variables
)
if no_blocking_variables:
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perhaps this check should happen in DataModel.init, and not in this utility function?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it seems of a piece with the "Incorrect variable specification: variable" error?

raise ValueError(
"At least one of the variable types needs to be a type"
"other than 'Custom'. 'Custom' types have no associated"
"blocking rules"
"At least one of the variable types needs to be a type "
"other than 'Custom' or 'Interaction', "
"since these types have no associated blocking rules."
)

return primary_variables, all_variables
return variables


def _expand_higher_variables(variables: Iterable[Variable]) -> list[Variable]:
result: list[Variable] = []
for variable in variables:
if hasattr(variable, "higher_vars"):
result.extend(variable.higher_vars)
else:
result.append(variable)
return result
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what if we added an iter method to Variable so that we could just do something like.

features = []
for variable in variables:
   for feature in variable:
       features.append(feature)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this is a good idea in general, but I would want to name it to differentiate between predicates and features, since Variables supply both. Perhaps Variable.features?



def missing(variables: list[Variable]) -> list[MissingDataType]:
Expand All @@ -217,16 +216,12 @@ def missing(variables: list[Variable]) -> list[MissingDataType]:
return missing_variables


def interactions(
definitions: Iterable[VariableDefinition], primary_variables: list[FieldVariable]
) -> list[InteractionType]:
field_d = {field.name: field for field in primary_variables}

def _expanded_interactions(variables: list[Variable]) -> list[InteractionType]:
field_vars = {var.name: var for var in variables if isinstance(var, FieldVariable)}
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Again, this should feels like it needs a more official API, or should be responsibility of the InteractionType

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i can think of a few designs here, but this code is actually doing two things for us, both of which are kind of important.

  1. isolating the interaction features for us, so we can put them in the right place in the sequence of features.
  2. actually resolving the feature indices.

seems like we kind of need to do both.

interactions = []
for definition in definitions:
if definition["type"] == "Interaction":
var = InteractionType(definition)
var.expandInteractions(field_d)
for var in variables:
if isinstance(var, InteractionType):
var.expandInteractions(field_vars)
interactions.extend(var.higher_vars)
return interactions

Expand All @@ -236,15 +231,27 @@ def missing_field_indices(variables: list[Variable]) -> list[int]:


def interaction_indices(variables: list[Variable]) -> list[list[int]]:
var_names = [var.name for var in variables]
_ensure_unique_names(variables)
name_to_index = {var.name: i for i, var in enumerate(variables)}
indices = []
for var in variables:
if hasattr(var, "interaction_fields"):
interaction_indices = [var_names.index(f) for f in var.interaction_fields] # type: ignore
interaction_indices = [name_to_index[f] for f in var.interaction_fields] # type: ignore
indices.append(interaction_indices)
return indices


def _ensure_unique_names(variables: Iterable[Variable]) -> None:
seen = set()
for var in variables:
if var.name in seen:
raise ValueError(
"Variable name used more than once! "
"Choose a unique name for each variable: '{var.name}'"
)
seen.add(var.name)


def reduce_method(m): # type: ignore[no-untyped-def]
return (getattr, (m.__self__, m.__func__.__name__))

Expand Down
9 changes: 2 additions & 7 deletions dedupe/variables/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,21 +58,16 @@ def all_subclasses(


class DerivedType(Variable):
type = "Derived"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why did you remove this?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I might be wrong but my understanding is:

This would only be used if someone passes in a variable definition like {"type": "Derived"}, and then we would create an instance of DerivedType, but that doesn't make sense because DerivedType is an abstract base class and shouldn't ever be created directly.


def __init__(self, definition: VariableDefinition):
self.name = "(%s: %s)" % (str(definition["name"]), str(definition["type"]))
super(DerivedType, self).__init__(definition)


class MissingDataType(Variable):
type = "MissingData"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why did you remove this?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same reason as for DerivedType

has_missing = False

def __init__(self, name: str):

self.name = "(%s: Not Missing)" % name

self.has_missing = False
self.name = f"({name}: Not Missing)"


class FieldType(Variable):
Expand Down
53 changes: 53 additions & 0 deletions tests/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,11 +58,13 @@ def test_initialize_fields(self):
[],
)

# only customs
with self.assertRaises(ValueError):
dedupe.api.ActiveMatching(
[{"field": "name", "type": "Custom", "comparator": lambda x, y: 1}],
)

# Only customs
with self.assertRaises(ValueError):
dedupe.api.ActiveMatching(
[
Expand All @@ -71,13 +73,64 @@ def test_initialize_fields(self):
],
)

# Only custom and interactions
with self.assertRaises(ValueError):
dedupe.api.ActiveMatching(
[
{"field": "name", "type": "Custom", "comparator": lambda x, y: 1},
{"field": "age", "type": "Custom", "comparator": lambda x, y: 1},
{"type": "Interaction", "interaction variables": ["name", "age"]},
],
)

# Only interactions
with self.assertRaises(ValueError):
dedupe.api.ActiveMatching(
[
{"type": "Interaction", "interaction variables": []},
],
)

# Duplicate variable names (explicitly)
with self.assertRaises(ValueError) as e:
dedupe.api.ActiveMatching(
[
{"field": "age", "type": "String", "variable name": "my_age"},
{"field": "age", "type": "ShortString", "variable name": "my_age"},
],
)
assert "Variable name used more than once!" in str(e.exception)

# Duplicate variable names (implicitly)
with self.assertRaises(ValueError) as e:
dedupe.api.ActiveMatching(
[
{"field": "age", "type": "String"},
{"field": "age", "type": "String"},
],
)
assert "Variable name used more than once!" in str(e.exception)

dedupe.api.ActiveMatching(
[
{"field": "name", "type": "Custom", "comparator": lambda x, y: 1},
{"field": "age", "type": "String"},
],
)

dedupe.api.ActiveMatching(
[
{"field": "name", "variable name": "name", "type": "String"},
{
"field": "age",
"variable name": "age",
"type": "Custom",
"comparator": lambda x, y: 1,
},
{"type": "Interaction", "interaction variables": ["name", "age"]},
],
)

def test_check_record(self):
matcher = dedupe.api.ActiveMatching(self.field_definition)

Expand Down