-
Notifications
You must be signed in to change notification settings - Fork 72
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
FastADC implementation #470
base: main
Are you sure you want to change the base?
Changes from all commits
451c76f
82f8437
3c50e82
66c13dd
f348335
9e057cf
fd64797
e6d2fc3
7753aed
effa8ba
c99e02c
623ab63
7245877
eded4d8
e1ed44c
8d5edc1
3943406
fbb6a86
fc63418
37e1136
68b990b
1a4a57c
dee8405
4decf6e
e200ba1
495ae73
e9c9528
148e24c
570c804
3d05336
e1f6f38
e3804a8
860fafd
0409803
eca4790
f774cc2
de78c4d
8c26f9a
bc1be2e
b87850b
8a0d876
d15ae01
1827d3d
b6a124d
c9302c8
8ff5492
036970f
33cddfe
c61f461
cc3c076
69ae4e8
2318ca4
65d930e
00d8041
cec11af
cfc5039
2ac5184
221ae1b
80dcae1
b75cfe7
f6c3022
7aa1f06
a5c5b19
fd91034
6ed4ac7
7ec6be3
ab25724
2879af7
b41fdff
a562274
c424a3f
9d8d389
8ce8fab
c02e8ab
f98e9e1
36dfea9
d0c362d
26bfbd6
0244e4d
2ba381e
12b2881
11aedfe
88a7b1f
cdd22ed
5e9835e
9f2a1d5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,107 @@ | ||
import desbordante as db | ||
import pandas as pd | ||
|
||
RED = '\033[31m' | ||
YELLOW = '\033[33m' | ||
GREEN = '\033[32m' | ||
CYAN = '\033[1m\033[36m' | ||
ENDC = '\033[0m' | ||
|
||
TABLE_1 = "examples/datasets/taxes.csv" | ||
TABLE_2 = "examples/datasets/taxes_2.csv" | ||
|
||
def print_table(filename: str, title: str = "") -> None:
    """Print the contents of a CSV file as a table.

    Args:
        filename: Path of the CSV file; its first row is used as the header.
        title: Optional heading printed on its own line before the table.
            Nothing is printed for an empty title.
    """
    if title:
        print(title)
    table = pd.read_csv(filename, header=0)
    print(table)
    # Separate the table from whatever is printed next with one empty line.
    print()
|
||
def main():
    """Demonstrate exact and approximate denial constraint (DC) mining.

    The walkthrough prints an introduction to DCs, mines DCs on TABLE_1 with
    evidence_threshold 0 (exact) and 0.5 (approximate), then mines exact DCs
    on TABLE_2 and explains which row pairs violate a DC that held on TABLE_1.
    """
    print(f"""{YELLOW}This file demonstrates how to discover Approximate Denial Constraints (ADCs){ENDC}.

DC {CYAN}φ{ENDC} is a conjunction of predicates of the following form:
{CYAN}∀s, t ∈ R, s ≠ t: ¬(p_1 ∧ . . . ∧ p_m){ENDC}

DCs involve comparisons between pairs of rows within a dataset.
A typical DC example, derived from a Functional Dependency such as {CYAN}A -> B{ENDC},
is expressed as: {CYAN}∀s, t ∈ R, s ≠ t, ¬(t.A == s.A and t.B ≠ s.B){ENDC}.
This denotes that for any pair of rows in the relation, it should not be the case
that while the values in column A are equal, the values in column B are unequal.

{YELLOW}Let's begin by looking at TABLE_1:{ENDC}
""")

    print_table(TABLE_1, "TABLE_1 (examples/datasets/taxes.csv):")

    # The threshold is the tolerated share of violating row pairs: 0 means
    # "no violations allowed" (exact mining), larger values relax the DC.
    print("""- The 'evidence_threshold' parameter specifies the maximum fraction of row pairs that may violate a DC for it to still be considered valid.
* evidence_threshold = 0 => exact DC mining, where all pairs must satisfy the DC.
* 0 < evidence_threshold < 1.0 => approximate DC mining, which allows that fraction of violations.
- The 'shard_length' parameter splits the dataset into row "shards" for parallelization. Here,
we set it to 0 so all rows are processed in one shard.
""")

    print(f"""{YELLOW}Mining exact DCs (evidence_threshold=0) on TABLE_1{ENDC}""")

    algo = db.dc.algorithms.Default()
    algo.load_data(table=(TABLE_1, ',', True))
    algo.execute(evidence_threshold=0, shard_length=0)
    dcs_table1_exact = algo.get_dcs()

    print(f"{YELLOW}Discovered DCs:{ENDC}")
    for dc in dcs_table1_exact:
        print(f" {CYAN}{dc}{ENDC}")
    print()

    print(f"""Note the following Denial Constraint: {GREEN}¬{{ t.State == s.State ∧ t.Salary <= s.Salary ∧ t.FedTaxRate >= s.FedTaxRate }}{ENDC}.
It tells us that for all people in the same state the person with a higher salary has a higher tax rate.
""")

    print(f"""{YELLOW}Now let's lower the evidence_threshold to 0.5 on TABLE_1{ENDC}
This means the DC only needs to hold for at least half of the row pairs, thus allowing more approximate constraints.
""")

    print(f"""{YELLOW}Mining ADCs (evidence_threshold=0.5) on TABLE_1{ENDC}""")

    # A fresh algorithm object is created for every run, as in the exact case.
    algo = db.dc.algorithms.Default()
    algo.load_data(table=(TABLE_1, ',', True))
    algo.execute(evidence_threshold=0.5, shard_length=0)
    dcs_table1_approx = algo.get_dcs()

    print(f"{YELLOW}Discovered ADCs:{ENDC}")
    for dc in dcs_table1_approx:
        print(f" {CYAN}{dc}{ENDC}")
    print()

    print(f"""{YELLOW}Let's take a look at TABLE_2:{ENDC}""")

    print_table(TABLE_2, "TABLE_2 (examples/datasets/taxes_2.csv):")

    print(f"""TABLE_2 is almost the same as TABLE_1, but we added a new record for Texas:
{GREEN}(State=Texas, Salary=5000, FedTaxRate=0.05){ENDC}.

This additional record violates one of the DCs that was valid in TABLE_1,
because it introduces a new pair of rows that breaks the constraint
""")

    print(f"""{YELLOW}Mining exact DCs (evidence_threshold=0) on TABLE_2{ENDC}""")

    algo = db.dc.algorithms.Default()
    algo.load_data(table=(TABLE_2, ',', True))
    algo.execute(evidence_threshold=0, shard_length=0)
    dcs_table2_exact = algo.get_dcs()

    print(f"{YELLOW}Discovered DCs:{ENDC}")
    for dc in dcs_table2_exact:
        print(f" {CYAN}{dc}{ENDC}")
    print()

    print(f"""Now we can see that the same DC we examined on the previous dataset doesn't hold on the new one.
The thing is that for the last record {GREEN}(Texas, 5000, 0.05){ENDC} there are people in Texas with a lower salary
but higher tax rate. Pairs of records like this that contradict a DC are called violations.
In this case the following pairs are violations: {RED}(6, 9), (7, 9), (8, 9){ENDC}, where each number is an index of a record.
""")
|
||
if __name__ == "__main__": | ||
main() | ||
|
Original file line number | Diff line number | Diff line change | ||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
@@ -0,0 +1,147 @@ | ||||||||||||||||||||
#include "algorithms/dc/FastADC/fastadc.h" | ||||||||||||||||||||
|
||||||||||||||||||||
#include <easylogging++.h> | ||||||||||||||||||||
|
||||||||||||||||||||
#include "config/names_and_descriptions.h" | ||||||||||||||||||||
#include "config/option.h" | ||||||||||||||||||||
#include "config/option_using.h" | ||||||||||||||||||||
#include "config/tabular_data/input_table/option.h" | ||||||||||||||||||||
#include "dc/FastADC/model/pli_shard.h" | ||||||||||||||||||||
#include "dc/FastADC/util/approximate_evidence_inverter.h" | ||||||||||||||||||||
#include "dc/FastADC/util/evidence_aux_structures_builder.h" | ||||||||||||||||||||
#include "dc/FastADC/util/evidence_set_builder.h" | ||||||||||||||||||||
#include "dc/FastADC/util/predicate_builder.h" | ||||||||||||||||||||
#include "descriptions.h" | ||||||||||||||||||||
#include "names.h" | ||||||||||||||||||||
|
||||||||||||||||||||
namespace algos::dc { | ||||||||||||||||||||
|
||||||||||||||||||||
/// Constructs the algorithm: registers all options and makes only the input
/// table option available, so the table is the first thing a caller sets.
FastADC::FastADC() : Algorithm({}) {
    RegisterOptions();

    auto const table_option_name = config::kTableOpt.GetName();
    MakeOptionsAvailable({table_option_name});
}
|
||||||||||||||||||||
void FastADC::RegisterOptions() { | ||||||||||||||||||||
DESBORDANTE_OPTION_USING; | ||||||||||||||||||||
|
||||||||||||||||||||
config::InputTable default_table; | ||||||||||||||||||||
|
||||||||||||||||||||
RegisterOption(config::kTableOpt(&input_table_)); | ||||||||||||||||||||
RegisterOption(Option{&shard_length_, kShardLength, kDShardLength, 350U}); | ||||||||||||||||||||
RegisterOption(Option{&allow_cross_columns_, kAllowCrossColumns, kDAllowCrossColumns, true}); | ||||||||||||||||||||
RegisterOption(Option{&minimum_shared_value_, kMinimumSharedValue, kDMinimumSharedValue, 0.3}); | ||||||||||||||||||||
RegisterOption( | ||||||||||||||||||||
Option{&comparable_threshold_, kComparableThreshold, kDComparableThreshold, 0.1}); | ||||||||||||||||||||
RegisterOption(Option{&evidence_threshold_, kEvidenceThreshold, kDEvidenceThreshold, 0.01}); | ||||||||||||||||||||
} | ||||||||||||||||||||
|
||||||||||||||||||||
void FastADC::MakeExecuteOptsAvailable() { | ||||||||||||||||||||
using namespace config::names; | ||||||||||||||||||||
|
||||||||||||||||||||
MakeOptionsAvailable({kShardLength, kAllowCrossColumns, kMinimumSharedValue, | ||||||||||||||||||||
kComparableThreshold, kEvidenceThreshold}); | ||||||||||||||||||||
} | ||||||||||||||||||||
|
||||||||||||||||||||
/// Builds the typed relation from the input table.
/// @throws std::runtime_error if the resulting relation has no columns.
void FastADC::LoadDataInternal() {
    // kMixed type will be treated as a string type
    typed_relation_ = model::ColumnLayoutTypedRelationData::CreateFrom(*input_table_, true, true);

    bool const has_no_columns = typed_relation_->GetColumnData().empty();
    if (has_no_columns) {
        throw std::runtime_error("Got an empty dataset: DC mining is meaningless.");
    }
}
|
||||||||||||||||||||
void FastADC::SetLimits() { | ||||||||||||||||||||
unsigned all_rows_num = typed_relation_->GetNumRows(); | ||||||||||||||||||||
|
||||||||||||||||||||
if (shard_length_ > all_rows_num) { | ||||||||||||||||||||
throw std::invalid_argument( | ||||||||||||||||||||
"'shard_length' (" + std::to_string(shard_length_) + | ||||||||||||||||||||
") must be less or equal to the number of rows in the table (total " | ||||||||||||||||||||
"rows: " + | ||||||||||||||||||||
std::to_string(all_rows_num) + ")"); | ||||||||||||||||||||
} | ||||||||||||||||||||
if (shard_length_ == 0) shard_length_ = all_rows_num; | ||||||||||||||||||||
} | ||||||||||||||||||||
|
||||||||||||||||||||
void FastADC::CheckTypes() { | ||||||||||||||||||||
model::ColumnIndex columns_num = typed_relation_->GetNumColumns(); | ||||||||||||||||||||
unsigned rows_num = typed_relation_->GetNumRows(); | ||||||||||||||||||||
|
||||||||||||||||||||
for (model::ColumnIndex column_index = 0; column_index < columns_num; column_index++) { | ||||||||||||||||||||
model::TypedColumnData const& column = typed_relation_->GetColumnData(column_index); | ||||||||||||||||||||
model::TypeId type_id = column.GetTypeId(); | ||||||||||||||||||||
|
||||||||||||||||||||
if (type_id == +model::TypeId::kMixed) { | ||||||||||||||||||||
LOG(WARNING) << "Column with index \"" + std::to_string(column_index) + | ||||||||||||||||||||
"\" contains values of different types. Those values will be " | ||||||||||||||||||||
"treated as strings."; | ||||||||||||||||||||
} else if (!column.IsNumeric() && type_id != +model::TypeId::kString) { | ||||||||||||||||||||
throw std::invalid_argument( | ||||||||||||||||||||
"Column with index \"" + std::to_string(column_index) + | ||||||||||||||||||||
"\" is of unsupported type. Only numeric and string types are supported."); | ||||||||||||||||||||
} | ||||||||||||||||||||
|
||||||||||||||||||||
for (std::size_t row_index = 0; row_index < rows_num; row_index++) { | ||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||||||||||||||
if (column.IsNull(row_index)) { | ||||||||||||||||||||
throw std::runtime_error("Some of the value coordinates are nulls."); | ||||||||||||||||||||
} | ||||||||||||||||||||
if (column.IsEmpty(row_index)) { | ||||||||||||||||||||
throw std::runtime_error("Some of the value coordinates are empty."); | ||||||||||||||||||||
} | ||||||||||||||||||||
Comment on lines
+86
to
+91
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If it's not important you may combine with
Suggested change
|
||||||||||||||||||||
} | ||||||||||||||||||||
} | ||||||||||||||||||||
} | ||||||||||||||||||||
|
||||||||||||||||||||
void FastADC::PrintResults() { | ||||||||||||||||||||
LOG(INFO) << "Total denial constraints: " << dcs_.TotalDCSize(); | ||||||||||||||||||||
LOG(INFO) << "Minimal denial constraints: " << dcs_.MinDCSize(); | ||||||||||||||||||||
LOG(DEBUG) << dcs_.ToString(); | ||||||||||||||||||||
} | ||||||||||||||||||||
|
||||||||||||||||||||
unsigned long long FastADC::ExecuteInternal() { | ||||||||||||||||||||
auto const start_time = std::chrono::system_clock::now(); | ||||||||||||||||||||
LOG(DEBUG) << "Start"; | ||||||||||||||||||||
|
||||||||||||||||||||
SetLimits(); | ||||||||||||||||||||
CheckTypes(); | ||||||||||||||||||||
|
||||||||||||||||||||
PredicateBuilder predicate_builder(&pred_provider_, &pred_index_provider_, allow_cross_columns_, | ||||||||||||||||||||
minimum_shared_value_, comparable_threshold_); | ||||||||||||||||||||
predicate_builder.BuildPredicateSpace(typed_relation_->GetColumnData()); | ||||||||||||||||||||
|
||||||||||||||||||||
PliShardBuilder pli_shard_builder(&int_prov_, &double_prov_, &string_prov_, shard_length_); | ||||||||||||||||||||
pli_shard_builder.BuildPliShards(typed_relation_->GetColumnData()); | ||||||||||||||||||||
|
||||||||||||||||||||
EvidenceAuxStructuresBuilder evidence_aux_structures_builder(predicate_builder); | ||||||||||||||||||||
evidence_aux_structures_builder.BuildAll(); | ||||||||||||||||||||
|
||||||||||||||||||||
EvidenceSetBuilder evidence_set_builder(pli_shard_builder.pli_shards, | ||||||||||||||||||||
evidence_aux_structures_builder.GetPredicatePacks()); | ||||||||||||||||||||
evidence_set_builder.BuildEvidenceSet(evidence_aux_structures_builder.GetCorrectionMap(), | ||||||||||||||||||||
evidence_aux_structures_builder.GetCardinalityMask()); | ||||||||||||||||||||
|
||||||||||||||||||||
LOG(INFO) << "Built evidence set"; | ||||||||||||||||||||
auto elapsed_milliseconds = std::chrono::duration_cast<std::chrono::milliseconds>( | ||||||||||||||||||||
std::chrono::system_clock::now() - start_time); | ||||||||||||||||||||
LOG(DEBUG) << "Current time: " << elapsed_milliseconds.count(); | ||||||||||||||||||||
|
||||||||||||||||||||
ApproxEvidenceInverter dcbuilder(predicate_builder, evidence_threshold_, | ||||||||||||||||||||
std::move(evidence_set_builder.evidence_set)); | ||||||||||||||||||||
|
||||||||||||||||||||
dcs_ = dcbuilder.BuildDenialConstraints(); | ||||||||||||||||||||
|
||||||||||||||||||||
PrintResults(); | ||||||||||||||||||||
|
||||||||||||||||||||
elapsed_milliseconds = std::chrono::duration_cast<std::chrono::milliseconds>( | ||||||||||||||||||||
std::chrono::system_clock::now() - start_time); | ||||||||||||||||||||
LOG(INFO) << "Algorithm time: " << elapsed_milliseconds.count(); | ||||||||||||||||||||
return elapsed_milliseconds.count(); | ||||||||||||||||||||
} | ||||||||||||||||||||
|
||||||||||||||||||||
// TODO: maybe make this a list?
std::vector<DenialConstraint> const& FastADC::GetDCs() const { | ||||||||||||||||||||
return dcs_.GetResult(); | ||||||||||||||||||||
} | ||||||||||||||||||||
|
||||||||||||||||||||
} // namespace algos::dc |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
#pragma once | ||
|
||
#include <memory> | ||
#include <vector> | ||
|
||
#include "algorithm.h" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Use full name |
||
#include "dc/FastADC/providers/predicate_provider.h" | ||
#include "dc/FastADC/util/denial_constraint_set.h" | ||
#include "model/denial_constraint.h" | ||
#include "table/column_layout_typed_relation_data.h" | ||
#include "tabular_data/input_table_type.h" | ||
|
||
namespace algos::dc { | ||
|
||
using namespace fastadc; | ||
|
||
class FastADC : public Algorithm { | ||
private: | ||
unsigned shard_length_; | ||
bool allow_cross_columns_; | ||
double minimum_shared_value_; | ||
double comparable_threshold_; | ||
double evidence_threshold_; | ||
|
||
config::InputTable input_table_; | ||
std::unique_ptr<model::ColumnLayoutTypedRelationData> typed_relation_; | ||
|
||
PredicateIndexProvider pred_index_provider_; | ||
PredicateProvider pred_provider_; | ||
IntIndexProvider int_prov_; | ||
DoubleIndexProvider double_prov_; | ||
StringIndexProvider string_prov_; | ||
DenialConstraintSet dcs_; | ||
|
||
void MakeExecuteOptsAvailable() override; | ||
void LoadDataInternal() override; | ||
|
||
void SetLimits(); | ||
void CheckTypes(); | ||
void PrintResults(); | ||
|
||
void ResetState() final { | ||
pred_index_provider_.Clear(); | ||
pred_provider_.Clear(); | ||
int_prov_.Clear(); | ||
double_prov_.Clear(); | ||
string_prov_.Clear(); | ||
dcs_.Clear(); | ||
} | ||
|
||
unsigned long long ExecuteInternal() final; | ||
|
||
void RegisterOptions(); | ||
|
||
public: | ||
FastADC(); | ||
|
||
std::vector<DenialConstraint> const& GetDCs() const; | ||
}; | ||
|
||
} // namespace algos::dc |
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
@@ -0,0 +1,53 @@ | ||||||
#pragma once | ||||||
|
||||||
#include "model/table/typed_column_data.h" | ||||||
|
||||||
namespace algos::fastadc { | ||||||
|
||||||
namespace details {
// Always-false trait whose value depends on a template parameter, so that a
// static_assert using it fires only when the enclosing branch is instantiated
// (i.e. for unsupported types).
template <typename>
struct DependentFalse : std::bool_constant<false> {};
}  // namespace details
|
||||||
// TODO: look at performance, is returning by const reference here beneficial? | ||||||
template <typename T> | ||||||
[[nodiscard]] T GetValue(model::TypedColumnData const& column, size_t row) { | ||||||
model::Type const& type = column.GetType(); | ||||||
|
||||||
if (!column.IsNullOrEmpty(row)) { | ||||||
return type.GetValue<T>(column.GetValue(row)); | ||||||
} | ||||||
|
||||||
/* | ||||||
* Mimicking the Java behavior: | ||||||
* https://github.com/RangerShaw/FastADC/blob/master/src/main/java/de/metanome/algorithms/dcfinder/input/Column.java#L71 | ||||||
* | ||||||
* public Long getLong(int line) { | ||||||
* return values.get(line).isEmpty() ? Long.MIN_VALUE : | ||||||
* Long.parseLong(values.get(line)); | ||||||
* } | ||||||
* | ||||||
* public Double getDouble(int line) { | ||||||
* return values.get(line).isEmpty() ? Double.MIN_VALUE : | ||||||
* Double.parseDouble(values.get(line)); | ||||||
* } | ||||||
* | ||||||
* public String getString(int line) { | ||||||
* return values.get(line) == null ? "" : values.get(line); | ||||||
* } | ||||||
*/ | ||||||
if constexpr (std::is_same_v<T, std::string>) { | ||||||
return ""; | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
} else if constexpr (std::is_same_v<T, int64_t>) { | ||||||
return std::numeric_limits<int64_t>::min(); | ||||||
} else if constexpr (std::is_same_v<T, double>) { | ||||||
return std::numeric_limits<double>::lowest(); | ||||||
} else { | ||||||
static_assert(details::DependentFalse<T>::value, | ||||||
"FastADC algorithm supports only int64_t, string, or double as column types. " | ||||||
"This function should not be called with other types."); | ||||||
} | ||||||
} | ||||||
|
||||||
} // namespace algos::fastadc |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.