Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement separator validation #326

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
115 changes: 115 additions & 0 deletions src/core/parser/csv_parser/csv_parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@
#include <cstddef>
#include <filesystem>
#include <fstream>
#include <sstream>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

Expand Down Expand Up @@ -180,3 +182,116 @@ std::vector<std::string> CSVParser::GetNextRow() {

return result;
}

std::optional<char> CSVParser::DeduceSeparator() {
// Calculate statistics including the header row
bool has_header_copy = has_header_;
has_header_ = false;
Reset();
has_header_ = has_header_copy;

std::unordered_map<char, unsigned> letter_count;
bool is_quoted;
if (has_next_) {
is_quoted = false;
for (char c : next_line_) {
if (c == quote_) {
is_quoted = !is_quoted;
} else if (!is_quoted) {
letter_count[c]++;
}
}
}

std::unordered_map<char, unsigned> next_letter_count;
while (has_next_) {
GetNextIfHas();
next_letter_count.clear();
is_quoted = false;
for (char c : next_line_) {
if (c == quote_) {
is_quoted = !is_quoted;
} else if (!is_quoted) {
next_letter_count[c]++;
}
}
for (auto letter : letter_count) {
if (letter.second != next_letter_count[letter.first]) {
letter_count[letter.first] = 0;
}
}
}

char possible_separator;
unsigned max_separator_count = 0;

for (auto letter : letter_count) {
if (letter.second > max_separator_count) {
max_separator_count = letter.second;
possible_separator = letter.first;
}
}
Reset();

if (max_separator_count) {
return possible_separator;
}

return std::nullopt;
}

bool CSVParser::CheckSeparator(char sep) {
// Calculate statistics including the header row
bool has_header_copy = has_header_;
has_header_ = false;
Reset();
has_header_ = has_header_copy;

char separator_copy = separator_;
separator_ = sep;

unsigned sep_count = 0;
std::vector<std::string> next_parsed;
if (has_next_) {
next_parsed = GetNextRow();
sep_count = next_parsed.size();
}

while (has_next_) {
next_parsed = GetNextRow();
if (sep_count != next_parsed.size()) {
Reset();
separator_ = separator_copy;
return false;
}
}

Reset();
separator_ = separator_copy;

return true;
}

std::pair<std::optional<char>, std::string> CSVParser::ValidateSeparator() {
std::optional<char> possible_separator = DeduceSeparator();

std::stringstream s;
if (CheckSeparator(separator_)) {
if (possible_separator == std::nullopt || separator_ == possible_separator ||
GetNumberOfColumns() != 1 || !CheckSeparator(possible_separator.value())) {
return {separator_, ""};
}

s << "Inserted separator for the table " << relation_name_ << " seems to be wrong\n";
s << "Possible separator for the table is: \'" << possible_separator.value() << "\'";
return {possible_separator, s.str()};
}

s << "Inserted separator for the table " << relation_name_ << " seems to be wrong";
if (possible_separator != std::nullopt && CheckSeparator(possible_separator.value())) {
s << "\nPossible separator for the table is: \'" << possible_separator.value() << "\'";
return {possible_separator, s.str()};
}

return {std::nullopt, s.str()};
}
5 changes: 5 additions & 0 deletions src/core/parser/csv_parser/csv_parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

#include <filesystem>
#include <fstream>
#include <optional>
#include <string>
#include <vector>

Expand Down Expand Up @@ -36,6 +37,8 @@ class CSVParser : public model::IDatasetStream {
std::vector<std::string> ParseString(std::string const& s) const;
void GetNextIfHas();
void SkipLine();
std::optional<char> DeduceSeparator();
bool CheckSeparator(char sep);

inline static std::string& Rtrim(std::string& s);

Expand All @@ -49,6 +52,8 @@ class CSVParser : public model::IDatasetStream {
std::string GetUnparsedLine(unsigned long long const line_index);
std::vector<std::string> ParseLine(unsigned long long const line_index);

std::pair<std::optional<char>, std::string> ValidateSeparator();

bool HasNextRow() const override {
return has_next_;
}
Expand Down
11 changes: 11 additions & 0 deletions src/core/util/separator_validator.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#include "separator_validator.h"

namespace util {

std::pair<std::optional<char>, std::string> ValidateSeparator(std::filesystem::path const& path,
char separator) {
auto parser = std::make_unique<CSVParser>(path, separator, false);
return parser->ValidateSeparator();
}

} // namespace util
14 changes: 14 additions & 0 deletions src/core/util/separator_validator.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#pragma once

#include <filesystem>
#include <optional>
#include <string>

#include "parser/csv_parser/csv_parser.h"

namespace util {

std::pair<std::optional<char>, std::string> ValidateSeparator(std::filesystem::path const& path,
char separator);

} // namespace util
7 changes: 7 additions & 0 deletions src/python_bindings/bind_main_classes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include "py_util/get_py_type.h"
#include "py_util/opt_to_py.h"
#include "py_util/py_to_any.h"
#include "separator_validator.h"

namespace {
namespace py = pybind11;
Expand Down Expand Up @@ -42,6 +43,12 @@ void BindMainClasses(py::module_& main_module) {
py::register_exception<config::ConfigurationError>(main_module, "ConfigurationError",
PyExc_ValueError);

auto util_module = main_module.def_submodule("util");
util_module.def(
"validate_separator",
[](std::string const& path, char sep) { return util::ValidateSeparator(path, sep); },
"Validate separator for a CSV table");

#define CERTAIN_SCRIPTS_ONLY \
"\nThis option is only expected to be used by Python scripts in which it is\n" \
"easier to set all options one by one. For normal use, you may set the\n" \
Expand Down
2 changes: 2 additions & 0 deletions src/tests/all_csv_configs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -119,4 +119,6 @@ CSVConfig const kTestDynamicFDUpdateBad3 =
CreateCsvConfig("dynamic_fd/TestDynamicUpdateBad3.csv", ',', true);
CSVConfig const kTestDynamicFDUpdateBad4 =
CreateCsvConfig("dynamic_fd/TestDynamicUpdateBad4.csv", ',', true);
CSVConfig const kTestSeparator = CreateCsvConfig("TestSeparator.csv", ',', false);
CSVConfig const kTestSeparator1 = CreateCsvConfig("TestSeparator1.csv", ',', false);
} // namespace tests
2 changes: 2 additions & 0 deletions src/tests/all_csv_configs.h
Original file line number Diff line number Diff line change
Expand Up @@ -99,4 +99,6 @@ extern CSVConfig const kTestDynamicFDUpdateBad1;
extern CSVConfig const kTestDynamicFDUpdateBad2;
extern CSVConfig const kTestDynamicFDUpdateBad3;
extern CSVConfig const kTestDynamicFDUpdateBad4;
extern CSVConfig const kTestSeparator;
extern CSVConfig const kTestSeparator1;
} // namespace tests
32 changes: 32 additions & 0 deletions src/tests/test_util.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include "model/table/agree_set_factory.h"
#include "model/table/column_layout_relation_data.h"
#include "model/table/identifier_set.h"
#include "separator_validator.h"

namespace tests {

Expand Down Expand Up @@ -230,4 +231,35 @@ INSTANTIATE_TEST_SUITE_P(TestLevenshteinSuite, TestLevenshtein,
TestLevenshteinParam("", "book", 4),
TestLevenshteinParam("randomstring", "juststring", 6)));

struct TestSeparatorValidationParam {
CSVConfig csv_config;
char test_separator;
std::optional<char> expected_separator;
};

class TestSeparatorValidation : public ::testing::TestWithParam<TestSeparatorValidationParam> {};

TEST_P(TestSeparatorValidation, Default) {
TestSeparatorValidationParam const& p = GetParam();
std::optional<char> actual = util::ValidateSeparator(p.csv_config.path, p.test_separator).first;
EXPECT_EQ(actual, p.expected_separator);
}

INSTANTIATE_TEST_SUITE_P(TestSeparatorValidationSuite, TestSeparatorValidation,
::testing::Values(TestSeparatorValidationParam(kTest1, ',', ','),
TestSeparatorValidationParam(kTest1, ';', ';'),
TestSeparatorValidationParam(kTest1, '1', std::nullopt),
TestSeparatorValidationParam(kTestFD, ',', ','),
TestSeparatorValidationParam(kTestFD, ';', ','),
TestSeparatorValidationParam(kAdult, ';', ';'),
TestSeparatorValidationParam(kAdult, ',', ';'),
TestSeparatorValidationParam(kAbalone, ',', ','),
TestSeparatorValidationParam(kAbalone, '.', ','),
TestSeparatorValidationParam(kTestParse, ',', ','),
TestSeparatorValidationParam(kTestSeparator, ',', ','),
TestSeparatorValidationParam(kTestSeparator, ';', ','),
TestSeparatorValidationParam(kTestSeparator1, ',', ','),
TestSeparatorValidationParam(kTestSeparator1, ';',
';')));

} // namespace tests
3 changes: 3 additions & 0 deletions test_input_data/TestSeparator.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
"a,;b",c
"a;","b,c"
a,b;c
3 changes: 3 additions & 0 deletions test_input_data/TestSeparator1.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
,;,;,;
;,;,;,
,;,;,;
Loading