diff --git a/src/core/parser/csv_parser/csv_parser.cpp b/src/core/parser/csv_parser/csv_parser.cpp index cb6ae0b4a0..8d353bbad5 100644 --- a/src/core/parser/csv_parser/csv_parser.cpp +++ b/src/core/parser/csv_parser/csv_parser.cpp @@ -4,7 +4,9 @@ #include #include #include +#include #include +#include #include #include @@ -180,3 +182,116 @@ std::vector CSVParser::GetNextRow() { return result; } + +/* Guesses the separator from character frequencies. Characters outside quote_-delimited regions are counted per line; a character whose count on any later line differs from its count on the first line is zeroed; the character with the largest remaining count is returned, or std::nullopt when every count is zero. Reset() is called before and after the scan. */ std::optional CSVParser::DeduceSeparator() { + // Calculate statistics including the header row + bool has_header_copy = has_header_; + has_header_ = false; + Reset(); + has_header_ = has_header_copy; + + std::unordered_map letter_count; + bool is_quoted; + if (has_next_) { + is_quoted = false; + for (char c : next_line_) { + if (c == quote_) { + is_quoted = !is_quoted; + } else if (!is_quoted) { + letter_count[c]++; + } + } + } + + std::unordered_map next_letter_count; + while (has_next_) { + GetNextIfHas(); /* NOTE(review): next_line_ is scanned below even on the iteration where this call clears has_next_ - confirm what next_line_ holds at end of input */ + next_letter_count.clear(); + is_quoted = false; + for (char c : next_line_) { + if (c == quote_) { + is_quoted = !is_quoted; + } else if (!is_quoted) { + next_letter_count[c]++; + } + } + for (auto letter : letter_count) { + if (letter.second != next_letter_count[letter.first]) { + letter_count[letter.first] = 0; /* disqualify: this line has a different count than the one kept from the first line */ + } + } + } + + char possible_separator; /* not initialized here; it is only returned when max_separator_count is non-zero, i.e. after the loop below has assigned it */ + unsigned max_separator_count = 0; + + for (auto letter : letter_count) { + if (letter.second > max_separator_count) { /* strict comparison: among equal counts the first one met in unordered_map iteration order wins */ + max_separator_count = letter.second; + possible_separator = letter.first; + } + } + Reset(); + + if (max_separator_count) { + return possible_separator; + } + + return std::nullopt; +} + +/* Returns true when, with separator_ temporarily set to sep, every GetNextRow() result has the same size() as the first one (also true when has_next_ is false after Reset()). separator_ is restored and Reset() is called on both exits. */ bool CSVParser::CheckSeparator(char sep) { + // Calculate statistics including the header row + bool has_header_copy = has_header_; + has_header_ = false; + Reset(); + has_header_ = has_header_copy; + + char separator_copy = separator_; + separator_ = sep; + + unsigned sep_count = 0; + std::vector next_parsed; + if (has_next_) { + next_parsed = GetNextRow(); + sep_count = next_parsed.size(); + } + + while 
(has_next_) { + next_parsed = GetNextRow(); + if (sep_count != next_parsed.size()) { + Reset(); + separator_ = separator_copy; + return false; + } + } + + Reset(); + separator_ = separator_copy; + + return true; +} + +/* Returns a pair of (separator to use, or std::nullopt) and a message. separator_ is returned with an empty message when it passes CheckSeparator(), unless GetNumberOfColumns() is 1 and a different deduced separator also passes. The deduced separator is returned with a message in that case, and also when separator_ fails the check but the deduced one passes. std::nullopt is returned with a message when separator_ fails and no deduced separator passes. */ std::pair, std::string> CSVParser::ValidateSeparator() { + std::optional possible_separator = DeduceSeparator(); + + std::stringstream s; + if (CheckSeparator(separator_)) { + if (possible_separator == std::nullopt || separator_ == possible_separator || + GetNumberOfColumns() != 1 || !CheckSeparator(possible_separator.value())) { + return {separator_, ""}; + } + + s << "Inserted separator for the table " << relation_name_ << " seems to be wrong\n"; + s << "Possible separator for the table is: \'" << possible_separator.value() << "\'"; + return {possible_separator, s.str()}; + } + + s << "Inserted separator for the table " << relation_name_ << " seems to be wrong"; + if (possible_separator != std::nullopt && CheckSeparator(possible_separator.value())) { + s << "\nPossible separator for the table is: \'" << possible_separator.value() << "\'"; + return {possible_separator, s.str()}; + } + + return {std::nullopt, s.str()}; +} diff --git a/src/core/parser/csv_parser/csv_parser.h b/src/core/parser/csv_parser/csv_parser.h index b8ce70e736..5f25d318d2 100644 --- a/src/core/parser/csv_parser/csv_parser.h +++ b/src/core/parser/csv_parser/csv_parser.h @@ -7,6 +7,7 @@ #include #include +#include #include #include @@ -36,6 +37,8 @@ class CSVParser : public model::IDatasetStream { std::vector ParseString(std::string const& s) const; void GetNextIfHas(); void SkipLine(); + std::optional DeduceSeparator(); /* frequency-based guess; std::nullopt when no character qualifies */ + bool CheckSeparator(char sep); /* true when all rows parsed with sep have the same size() */ inline static std::string& Rtrim(std::string& s); @@ -49,6 +52,8 @@ class CSVParser : public model::IDatasetStream { std::string GetUnparsedLine(unsigned long long const line_index); std::vector ParseLine(unsigned long long const line_index); + /* Checks separator_ against the data: first is the separator to use or std::nullopt, second is a message that is empty when separator_ is kept */ std::pair, std::string> ValidateSeparator(); + bool HasNextRow() const override { return 
has_next_; } diff --git a/src/core/util/separator_validator.cpp b/src/core/util/separator_validator.cpp new file mode 100644 index 0000000000..0696425d1b --- /dev/null +++ b/src/core/util/separator_validator.cpp @@ -0,0 +1,11 @@ +#include "separator_validator.h" + +namespace util { + +/* Constructs a CSVParser over path with the given separator (third constructor argument is false - presumably has_header, confirm against the CSVParser constructor) and returns the result of its ValidateSeparator(). */ std::pair, std::string> ValidateSeparator(std::filesystem::path const& path, + char separator) { + auto parser = std::make_unique(path, separator, false); + return parser->ValidateSeparator(); +} + +} // namespace util diff --git a/src/core/util/separator_validator.h b/src/core/util/separator_validator.h new file mode 100644 index 0000000000..cd0c8c95d2 --- /dev/null +++ b/src/core/util/separator_validator.h @@ -0,0 +1,14 @@ +#pragma once + +#include +#include +#include + +#include "parser/csv_parser/csv_parser.h" + +namespace util { + +std::pair, std::string> ValidateSeparator(std::filesystem::path const& path, + char separator); + +} // namespace util diff --git a/src/python_bindings/bind_main_classes.cpp b/src/python_bindings/bind_main_classes.cpp index 2f3ea88a16..e7c83f6d6c 100644 --- a/src/python_bindings/bind_main_classes.cpp +++ b/src/python_bindings/bind_main_classes.cpp @@ -13,6 +13,7 @@ #include "py_util/get_py_type.h" #include "py_util/opt_to_py.h" #include "py_util/py_to_any.h" +#include "separator_validator.h" namespace { namespace py = pybind11; @@ -42,6 +43,12 @@ void BindMainClasses(py::module_& main_module) { py::register_exception(main_module, "ConfigurationError", PyExc_ValueError); + /* Registers util::ValidateSeparator as validate_separator(path, sep) in the util submodule of main_module */ auto util_module = main_module.def_submodule("util"); + util_module.def( + "validate_separator", + [](std::string const& path, char sep) { return util::ValidateSeparator(path, sep); }, + "Validate separator for a CSV table"); + #define CERTAIN_SCRIPTS_ONLY \ "\nThis option is only expected to be used by Python scripts in which it is\n" \ "easier to set all options one by one. 
For normal use, you may set the\n" \ diff --git a/src/tests/all_csv_configs.cpp b/src/tests/all_csv_configs.cpp index 869a7a6ed3..b17036c902 100644 --- a/src/tests/all_csv_configs.cpp +++ b/src/tests/all_csv_configs.cpp @@ -119,4 +119,6 @@ CSVConfig const kTestDynamicFDUpdateBad3 = CreateCsvConfig("dynamic_fd/TestDynamicUpdateBad3.csv", ',', true); CSVConfig const kTestDynamicFDUpdateBad4 = CreateCsvConfig("dynamic_fd/TestDynamicUpdateBad4.csv", ',', true); +CSVConfig const kTestSeparator = CreateCsvConfig("TestSeparator.csv", ',', false); +CSVConfig const kTestSeparator1 = CreateCsvConfig("TestSeparator1.csv", ',', false); } // namespace tests diff --git a/src/tests/all_csv_configs.h b/src/tests/all_csv_configs.h index 40e7c25e86..33c1f435a0 100644 --- a/src/tests/all_csv_configs.h +++ b/src/tests/all_csv_configs.h @@ -99,4 +99,6 @@ extern CSVConfig const kTestDynamicFDUpdateBad1; extern CSVConfig const kTestDynamicFDUpdateBad2; extern CSVConfig const kTestDynamicFDUpdateBad3; extern CSVConfig const kTestDynamicFDUpdateBad4; +extern CSVConfig const kTestSeparator; +extern CSVConfig const kTestSeparator1; } // namespace tests diff --git a/src/tests/test_util.cpp b/src/tests/test_util.cpp index db5e5ff08a..bd261a9967 100644 --- a/src/tests/test_util.cpp +++ b/src/tests/test_util.cpp @@ -11,6 +11,7 @@ #include "model/table/agree_set_factory.h" #include "model/table/column_layout_relation_data.h" #include "model/table/identifier_set.h" +#include "separator_validator.h" namespace tests { @@ -230,4 +231,35 @@ INSTANTIATE_TEST_SUITE_P(TestLevenshteinSuite, TestLevenshtein, TestLevenshteinParam("", "book", 4), TestLevenshteinParam("randomstring", "juststring", 6))); +struct TestSeparatorValidationParam { + CSVConfig csv_config; + char test_separator; + std::optional expected_separator; +}; + +class TestSeparatorValidation : public ::testing::TestWithParam {}; + +TEST_P(TestSeparatorValidation, Default) { + TestSeparatorValidationParam const& p = GetParam(); + 
std::optional actual = util::ValidateSeparator(p.csv_config.path, p.test_separator).first; + EXPECT_EQ(actual, p.expected_separator); +} + +INSTANTIATE_TEST_SUITE_P(TestSeparatorValidationSuite, TestSeparatorValidation, + ::testing::Values(TestSeparatorValidationParam(kTest1, ',', ','), + TestSeparatorValidationParam(kTest1, ';', ';'), + TestSeparatorValidationParam(kTest1, '1', std::nullopt), + TestSeparatorValidationParam(kTestFD, ',', ','), + TestSeparatorValidationParam(kTestFD, ';', ','), + TestSeparatorValidationParam(kAdult, ';', ';'), + TestSeparatorValidationParam(kAdult, ',', ';'), + TestSeparatorValidationParam(kAbalone, ',', ','), + TestSeparatorValidationParam(kAbalone, '.', ','), + TestSeparatorValidationParam(kTestParse, ',', ','), + TestSeparatorValidationParam(kTestSeparator, ',', ','), + TestSeparatorValidationParam(kTestSeparator, ';', ','), + TestSeparatorValidationParam(kTestSeparator1, ',', ','), + TestSeparatorValidationParam(kTestSeparator1, ';', + ';'))); + } // namespace tests diff --git a/test_input_data/TestSeparator.csv b/test_input_data/TestSeparator.csv new file mode 100644 index 0000000000..262f69f1b2 --- /dev/null +++ b/test_input_data/TestSeparator.csv @@ -0,0 +1,3 @@ +"a,;b",c +"a;","b,c" +a,b;c diff --git a/test_input_data/TestSeparator1.csv b/test_input_data/TestSeparator1.csv new file mode 100644 index 0000000000..222f7b44c8 --- /dev/null +++ b/test_input_data/TestSeparator1.csv @@ -0,0 +1,3 @@ +,;,;,; +;,;,;, +,;,;,;