Skip to content

Commit

Permalink
Add check for unneeded jsonl columns
Browse files Browse the repository at this point in the history
  • Loading branch information
azahed98 authored Dec 11, 2024
2 parents 49b8c28 + 784fd4f commit 6a3dc7e
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 2 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ build-backend = "poetry.masonry.api"

[tool.poetry]
name = "together"
version = "1.3.6"
version = "1.3.7"
authors = [
"Together AI <[email protected]>"
]
Expand Down
15 changes: 14 additions & 1 deletion src/together/utils/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,8 @@ def _check_jsonl(file: Path) -> Dict[str, Any]:
raise InvalidFileFormatError(
message=(
f"Error parsing file. Invalid format on line {idx + 1} of the input file. "
'Example of valid json: {"text": "my sample string"}. '
"Datasets must follow text, conversational, or instruction format. For more"
"information, see https://docs.together.ai/docs/fine-tuning-data-preparation"
),
line_number=idx + 1,
error_source="line_type",
Expand All @@ -142,6 +143,18 @@ def _check_jsonl(file: Path) -> Dict[str, Any]:
error_source="format",
)

# Check that there are no extra columns
for column in json_line:
if (
column
not in JSONL_REQUIRED_COLUMNS_MAP[possible_format]
):
raise InvalidFileFormatError(
message=f'Found extra column "{column}" in the line {idx + 1}.',
line_number=idx + 1,
error_source="format",
)

if current_format is None:
raise InvalidFileFormatError(
message=(
Expand Down
11 changes: 11 additions & 0 deletions tests/unit/test_files_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,3 +279,14 @@ def test_check_jsonl_wrong_turn_type(tmp_path: Path):
"Invalid format on line 1 of the input file. Expected a dictionary"
in report["message"]
)


def test_check_jsonl_extra_column(tmp_path: Path):
    """A JSONL row with a key besides "text" must be rejected by check_file."""
    # One record: the "text" key plus an additional "extra_column" key.
    rows = [{"text": "Hello, world!", "extra_column": "extra"}]
    jsonl_path = tmp_path / "extra_column.jsonl"
    jsonl_path.write_text("\n".join(json.dumps(row) for row in rows))

    result = check_file(jsonl_path)

    # The check must fail, and the message must name the extra-column problem.
    assert not result["is_check_passed"]
    assert "Found extra column" in result["message"]

0 comments on commit 6a3dc7e

Please sign in to comment.