Skip to content

Commit

Permalink
Item Categorization Part 7: Add is_entity and has_entity operator
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 713361348
Change-Id: Id2b52a551c53e9fdf43703ccb7e012cb913e4d96
  • Loading branch information
dilumich authored and copybara-github committed Jan 8, 2025
1 parent 5c4cfac commit b382a76
Show file tree
Hide file tree
Showing 13 changed files with 548 additions and 99 deletions.
194 changes: 117 additions & 77 deletions docs/cheatsheet.md
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ ds.display()

### Entities

Entities can be thought as instances of protos or C++ structs. That is, they
Entities can be thought of as instances of protos or C++ structs. That is, they
don't directly store their own schema. Instead, their schema is stored at
DataSlice level and all entities in a DataSlice share the same schema.

Expand All @@ -157,6 +157,8 @@ es = kd.new(x=kd.slice([1, 2, None]),
e.get_schema()
assert e.get_schema() == es.get_schema()

assert e.is_entity()

# Use an existing schema
s = kd.named_schema('Point', x=kd.INT32, y=kd.INT32)
e = kd.new(x=1, y=2, schema=s)
Expand Down Expand Up @@ -222,82 +224,6 @@ nested = nested.updated(kd.attrs(nested.a.c, e=4),

<section>

### Objects

Objects can be thought as Python objects. They directly store their own schema
as `__schema__` attribute similar to how Python objects store `__class__` attribute.
This allows objects in a DataSlice to have different schemas.

```py
o = kd.obj(x=1, y=2)
os = kd.obj(x=kd.slice([1, 2, None]),
y=kd.slice([4, None, 6]))

os = kd.slice([kd.obj(x=1),
kd.obj(y=2.0),
kd.obj(x=1.0, y='a')])

os.get_schema() # kd.OBJECT
os.get_obj_schema()
# [IMPLICIT_SCHEMA(x=INT32),
# IMPLICIT_SCHEMA(y=FLOAT32),
# IMPLICIT_SCHEMA(x=INT32, y=STRING)]

# Use provided itemids
itemid = kd.new_itemid()
o1 = kd.obj(x=1, y=2, itemid=itemid)
o2 = kd.obj(x=1, y=2, itemid=itemid)
assert o1.get_itemid() == o2.get_itemid()

# Get available attributes
os1 = kd.slice([kd.obj(x=1), kd.obj(x=1.0, y='a')])
# Attributes present in all objects
kd.dir(os1) # ['x']
# Or
os1.get_attr_names(intersection=True) # ['x']
os1.get_attr_names(intersection=False) # ['x', 'y']

# Access attribute
o.x # 1
o.get_attr('y') # 2
o.maybe('z') # None
o.get_attr('z', default=0) # 0
os.get_attr('x', default=0) # [1, 0, 'a']

# Objects are immutable by default, modification is done
# by creating a new object with updated attributes
o = kd.obj(x=1, y=2)

# Update a single attribute
o1 = o.with_attr('x', 3)
o1 = o.with_attr('z', 4)
# Also override schema
# no update_schema=True is needed
o1 = o.with_attr('y', 'a')

# Update multiple attributes
o2 = o.with_attrs(z=4, x=3)
# Also override schema for 'y'
o2 = o.with_attrs(z=4, y='a')

# Create an update and apply it separately
upd = kd.attrs(o, z=4, y=10)
o3 = o.updated(upd)

# Allows mixing multiple updates
o4 = o.updated(kd.attrs(o, z=4), kd.attrs(o, y=10))

# Update nested attributes
nested = kd.obj(a=kd.obj(c=kd.obj(e=1), d=2), b=3)
nested = nested.updated(kd.attrs(nested.a.c, e=4),
kd.attrs(nested.a, d=5),
kd.attrs(nested, b=6))
```

</section>

<section>

### Lists

```py
Expand Down Expand Up @@ -408,6 +334,111 @@ d7 = d1.updated(d1.dict_update('c', 5),

<section>

### Objects

Objects can be thought of as Python objects. They directly store their own schema
as a `__schema__` attribute, similar to how Python objects store a `__class__` attribute.
This allows objects in a DataSlice to have different schemas. Entities, Lists,
Dicts and primitives can be objects. Entities, Lists and Dicts store their own
schema as an internal `__schema__` attribute while primitives' schema is
determined by the type of their value.

```py
# Entity objects
o = kd.obj(x=1, y=2)
os = kd.obj(x=kd.slice([1, 2, None]),
y=kd.slice([4, None, 6]))

os = kd.slice([kd.obj(x=1),
kd.obj(y=2.0),
kd.obj(x=1.0, y='a')])

os.get_schema() # kd.OBJECT
os.get_obj_schema()
# [IMPLICIT_SCHEMA(x=INT32),
# IMPLICIT_SCHEMA(y=FLOAT32),
# IMPLICIT_SCHEMA(x=FLOAT32, y=STRING)]

# Use provided itemids
itemid = kd.new_itemid()
o1 = kd.obj(x=1, y=2, itemid=itemid)
o2 = kd.obj(x=1, y=2, itemid=itemid)
assert o1.get_itemid() == o2.get_itemid()

# Get available attributes
os1 = kd.slice([kd.obj(x=1), kd.obj(x=1.0, y='a')])
# Attributes present in all objects
kd.dir(os1) # ['x']
# Or
os1.get_attr_names(intersection=True) # ['x']
os1.get_attr_names(intersection=False) # ['x', 'y']

# Access attribute
o.x # 1
o.get_attr('y') # 2
o.maybe('z') # None
o.get_attr('z', default=0) # 0
os.get_attr('x', default=0) # [1, 0, 1.0]

# Objects are immutable by default, modification is done
# by creating a new object with updated attributes
o = kd.obj(x=1, y=2)

# Update a single attribute
o1 = o.with_attr('x', 3)
o1 = o.with_attr('z', 4)
# Also override schema
# no update_schema=True is needed
o1 = o.with_attr('y', 'a')

# Update multiple attributes
o2 = o.with_attrs(z=4, x=3)
# Also override schema for 'y'
o2 = o.with_attrs(z=4, y='a')

# Create an update and apply it separately
upd = kd.attrs(o, z=4, y=10)
o3 = o.updated(upd)

# Allows mixing multiple updates
o4 = o.updated(kd.attrs(o, z=4), kd.attrs(o, y=10))

# Update nested attributes
nested = kd.obj(a=kd.obj(c=kd.obj(e=1), d=2), b=3)
nested = nested.updated(kd.attrs(nested.a.c, e=4),
kd.attrs(nested.a, d=5),
kd.attrs(nested, b=6))

# List and dict can be objects too
# To convert a list/dict to an object,
# use kd.obj()
l = kd.list([1, 2, 3])
l_obj = kd.obj(l)
l_obj[:] # [1, 2, 3]

d = kd.dict({'a': 1, 'b': 2})
d_obj = kd.obj(d)
d_obj.get_keys() # ['a', 'b']
d_obj['a'] # 1

# Convert an entity to an object
e = kd.new(x=1, y=2)
e_obj = kd.obj(e)

# Primitives can also be passed to kd.obj()
p_obj = kd.obj(1)
p_obj = kd.obj('a')

# An OBJECT DataSlice with entity, list,
# dict and primitive items
kd.slice([kd.obj(a=1), 1, kd.obj(kd.list([1, 2])),
kd.obj(kd.dict({'a': 1}))])
```

</section>

<section>

### Subslicing DataSlices

Subslicing is an operation of getting part of the items in a DataSlice.
Expand Down Expand Up @@ -821,6 +852,8 @@ kd.has_not(a) # [missing, present, missing]
b = kd.slice([kd.obj(), kd.obj(kd.list()),
kd.obj(kd.dict()), None, 1])

kd.has_entity(b)
# -> [present, missing, missing, missing, missing]
kd.has_list(b)
# -> [missing, present, missing, missing, missing]
kd.has_dict(b)
Expand Down Expand Up @@ -1576,6 +1609,10 @@ Line = kd.named_schema('Line', start=Point, end=kd.ANY)
# Get the attribute start's schema
Line.start

# Check if it is an Entity schema
assert Point.is_entity_schema()
assert Line.is_entity_schema()

# List schema
ls1 = kd.list_schema(kd.INT64)

Expand Down Expand Up @@ -1611,6 +1648,9 @@ uus1 = kd.uu_schema(x=kd.INT32, y=kd.FLOAT64)
uus2 = kd.uu_schema(x=kd.INT32, y=kd.FLOAT64)
assert uus1 == uus2

# It is also an Entity schema
assert uus1.is_entity_schema()

# In fact, named, list and dict schemas are also
# UU schemas
Point1 = kd.named_schema('Point', x=kd.INT32, y=kd.FLOAT64)
Expand Down
1 change: 1 addition & 0 deletions koladata/operators/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,7 @@ cc_library(
"@com_google_absl//absl/types:optional",
"@com_google_absl//absl/types:span",
"@com_google_arolla//arolla/dense_array",
"@com_google_arolla//arolla/dense_array/ops",
"@com_google_arolla//arolla/dense_array/qtype",
"@com_google_arolla//arolla/expr",
"@com_google_arolla//arolla/jagged_shape/dense_array/qtype",
Expand Down
2 changes: 2 additions & 0 deletions koladata/operators/operators.cc
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,9 @@ OPERATOR_FAMILY("kde.core.enriched",
OPERATOR("kde.core.follow", Follow);
OPERATOR("kde.core.freeze_bag", Freeze<DataSlice>);
OPERATOR("kde.core.get_bag", GetBag);
OPERATOR("kde.core.has_entity", HasEntity);
OPERATOR("kde.core.has_primitive", HasPrimitive);
OPERATOR("kde.core.is_entity", IsEntity);
OPERATOR("kde.core.is_primitive", IsPrimitive);
OPERATOR("kde.core.no_bag", NoBag);
OPERATOR("kde.core.nofollow", NoFollow);
Expand Down
81 changes: 63 additions & 18 deletions koladata/operators/predicates.cc
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
#include "koladata/operators/masking.h"
#include "koladata/operators/utils.h"
#include "arolla/dense_array/dense_array.h"
#include "arolla/dense_array/ops/dense_ops.h"
#include "arolla/memory/optional_value.h"
#include "arolla/util/unit.h"
#include "arolla/util/view_types.h"
#include "arolla/util/status_macros_backport.h"
Expand Down Expand Up @@ -60,6 +62,30 @@ absl::StatusOr<internal::DataSliceImpl> HasPrimitiveImpl(
return std::move(builder).Build();
}

// Single-item overload: yields a DataItem holding arolla::Unit when
// `item.is_entity()` is true, and a default-constructed DataItem otherwise.
absl::StatusOr<internal::DataItem> HasEntityImpl(
    const internal::DataItem& item) {
  return item.is_entity() ? internal::DataItem(arolla::Unit())
                          : internal::DataItem();
}

// Slice overload: produces a Unit-typed DataSliceImpl of the same size as
// `slice`. Only the ObjectId-typed values of `slice` are inspected; each one is
// mapped through ObjectId::IsEntity(). Values of any other type are ignored.
absl::StatusOr<internal::DataSliceImpl> HasEntityImpl(
    const internal::DataSliceImpl& slice) {
  // Pointwise predicate applied to the ObjectId array.
  auto entity_mask_op = arolla::CreateDenseOp(
      [](arolla::view_type_t<internal::ObjectId> object_id)
          -> arolla::OptionalValue<arolla::Unit> {
        return arolla::OptionalUnit(object_id.IsEntity());
      });
  // Starts as an empty Unit array of the slice's size; it is replaced only if
  // the slice actually carries ObjectId values.
  auto mask = arolla::CreateEmptyDenseArray<arolla::Unit>(slice.size());
  slice.VisitValues([&]<class T>(const arolla::DenseArray<T>& typed_values) {
    if constexpr (std::is_same_v<T, internal::ObjectId>) {
      mask = entity_mask_op(typed_values);
    }
  });
  return internal::DataSliceImpl::Create(std::move(mask));
}

absl::StatusOr<internal::DataItem> HasListImpl(const internal::DataItem& item) {
if (item.is_list()) {
return internal::DataItem(arolla::Unit());
Expand All @@ -70,19 +96,17 @@ absl::StatusOr<internal::DataItem> HasListImpl(const internal::DataItem& item) {

// Slice overload of HasListImpl: builds a Unit-typed result of the same size as
// `slice`. Only the ObjectId-typed values are inspected, and an item is marked
// when its ObjectId reports IsList(); values of other types are not visited.
absl::StatusOr<internal::DataSliceImpl> HasListImpl(
const internal::DataSliceImpl& slice) {
internal::SliceBuilder builder(slice.size());
auto typed_builder = builder.typed<arolla::Unit>();
auto result = arolla::CreateEmptyDenseArray<arolla::Unit>(slice.size());
slice.VisitValues([&]<class T>(const arolla::DenseArray<T>& values) {
if constexpr (std::is_same_v<T, internal::ObjectId>) {
values.ForEachPresent(
[&](int64_t id, arolla::view_type_t<internal::ObjectId> value) {
if (value.IsList()) {
typed_builder.InsertIfNotSet(id, arolla::Unit());
}
});
result = arolla::CreateDenseOp(
[](arolla::view_type_t<internal::ObjectId> value)
-> arolla::OptionalValue<arolla::Unit> {
return arolla::OptionalUnit(value.IsList());
})(values);
}
});
return std::move(builder).Build();
return internal::DataSliceImpl::Create(std::move(result));
}

absl::StatusOr<internal::DataItem> HasDictImpl(const internal::DataItem& item) {
Expand All @@ -95,19 +119,17 @@ absl::StatusOr<internal::DataItem> HasDictImpl(const internal::DataItem& item) {

// Slice overload of HasDictImpl: builds a Unit-typed result of the same size as
// `slice`. Only the ObjectId-typed values are inspected, and an item is marked
// when its ObjectId reports IsDict(); values of other types are not visited.
absl::StatusOr<internal::DataSliceImpl> HasDictImpl(
const internal::DataSliceImpl& slice) {
internal::SliceBuilder builder(slice.size());
auto typed_builder = builder.typed<arolla::Unit>();
auto result = arolla::CreateEmptyDenseArray<arolla::Unit>(slice.size());
slice.VisitValues([&]<class T>(const arolla::DenseArray<T>& values) {
if constexpr (std::is_same_v<T, internal::ObjectId>) {
values.ForEachPresent(
[&](int64_t id, arolla::view_type_t<internal::ObjectId> value) {
if (value.IsDict()) {
typed_builder.InsertIfNotSet(id, arolla::Unit());
}
});
result = arolla::CreateDenseOp(
[](arolla::view_type_t<internal::ObjectId> value)
-> arolla::OptionalValue<arolla::Unit> {
return arolla::OptionalUnit(value.IsDict());
})(values);
}
});
return std::move(builder).Build();
return internal::DataSliceImpl::Create(std::move(result));
}

} // namespace
Expand All @@ -134,6 +156,25 @@ absl::StatusOr<DataSlice> HasPrimitive(const DataSlice& x) {
x.GetShape(), internal::DataItem(schema::kMask), nullptr);
}

// Returns a DataSlice with schema kMask and the shape of `x` that marks which
// items of `x` are entities. The strategy depends on the schema of `x`:
//   * Entity schema   -> delegate to Has(x).
//   * OBJECT / ANY    -> inspect the data through HasEntityImpl.
//   * anything else   -> result built from an empty, unknown-type impl.
absl::StatusOr<DataSlice> HasEntity(const DataSlice& x) {
  // Trust the schema if it is an Entity schema.
  if (x.GetSchema().IsEntitySchema()) {
    return Has(x);
  }
  const auto& schema_item = x.GetSchemaImpl();
  const bool derive_from_data =
      schema_item.is_any_schema() || schema_item.is_object_schema();
  if (!derive_from_data) {
    return DataSlice::Create(
        internal::DataSliceImpl::CreateEmptyAndUnknownType(x.size()),
        x.GetShape(), internal::DataItem(schema::kMask), nullptr);
  }
  // OBJECT and ANY schemas carry no static guarantee, so look at each item.
  return x.VisitImpl([&](const auto& impl) -> absl::StatusOr<DataSlice> {
    ASSIGN_OR_RETURN(auto entity_mask, HasEntityImpl(impl));
    return DataSlice::Create(std::move(entity_mask), x.GetShape(),
                             internal::DataItem(schema::kMask), nullptr);
  });
}

absl::StatusOr<DataSlice> HasList(const DataSlice& x) {
auto schema = x.GetSchemaImpl();
// Trust the schema if it is a List schema.
Expand Down Expand Up @@ -202,6 +243,10 @@ absl::StatusOr<DataSlice> IsPrimitive(const DataSlice& x) {
return AsMask(contains_only_primitives);
}

// Whole-slice predicate: converts the result of `x.IsEntity()` into a mask
// DataSlice via AsMask.
absl::StatusOr<DataSlice> IsEntity(const DataSlice& x) {
  const bool is_entity = x.IsEntity();
  return AsMask(is_entity);
}

// Whole-slice predicate: converts the result of `x.IsList()` into a mask
// DataSlice via AsMask.
absl::StatusOr<DataSlice> IsList(const DataSlice& x) {
  const bool is_list = x.IsList();
  return AsMask(is_list);
}
Expand Down
Loading

0 comments on commit b382a76

Please sign in to comment.