Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add CSV parsing functions #2361

Merged
merged 11 commits into from
Jan 13, 2025
57 changes: 52 additions & 5 deletions basex-core/src/main/java/org/basex/build/csv/CsvOptions.java
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,18 @@ public class CsvOptions extends Options {
public static final BooleanOption LAX = new BooleanOption("lax", true);
/** Option: parse quotes. */
public static final BooleanOption QUOTES = new BooleanOption("quotes", true);
/**
 * Option: row delimiter (default: newline).
 * The value must be a single valid XML character, and it must differ from the separator and the
 * quote character; this is checked whenever an option is assigned.
 */
public static final StringOption ROW_DELIMITER = new StringOption("row-delimiter", "\n");
/**
 * Option: quote character (default: double quote).
 * The value must be a single valid XML character, and it must differ from the separator and the
 * row delimiter; this is checked whenever an option is assigned.
 */
public static final StringOption QUOTE_CHARACTER = new StringOption("quote", "\"");
/** Option: trim whitespace (the parser passes every field through {@code Token.trim}). */
public static final BooleanOption TRIM_WHITESPACE = new BooleanOption("trim-whitespace", false);
/**
 * Option: strict quoting (implies QUOTES).
 * If enabled, the parser raises an error if a quote character occurs inside an unquoted,
 * non-empty field, if a closing quote is followed by anything but a separator, a row delimiter
 * or the end of the input, or if the input ends inside a quoted field.
 */
public static final BooleanOption STRICT_QUOTING = new BooleanOption("strict-quoting", false);
/**
 * Option: trim-rows.
 * If enabled (and no columns are selected), every row is output with as many fields as the
 * first row: surplus fields are dropped, missing fields are added as empty fields.
 */
public static final BooleanOption TRIM_ROWS = new BooleanOption("trim-rows", false);
/**
 * Option: select-columns.
 * 1-based positions of the columns to output, in the given order; positions beyond the end of
 * a row yield empty fields, and values smaller than 1 are rejected by the parser.
 * NOTE(review): presumably empty by default, in which case all columns are output - confirm
 * against {@code NumbersOption}.
 */
public static final NumbersOption SELECT_COLUMNS = new NumbersOption("select-columns");

/** CSV formats. */
public enum CsvFormat {
Expand Down Expand Up @@ -70,14 +82,24 @@ public String toString() {
/**
 * Assigns an option and validates the CSV delimiters afterwards.
 * @param name name of the option
 * @param value value to be assigned
 * @throws BaseXException if the separator, row delimiter or quote character is not a single
 *   valid character, or if two of them are identical
 */
@Override
public synchronized void assign(final String name, final String value) throws BaseXException {
  super.assign(name, value);
  // resolve each delimiter once; -1 indicates an invalid value
  final int s = separator(), r = rowDelimiter(), q = quoteCharacter();
  if(s == -1) throw new BaseXException("Invalid separator: '%'", get(SEPARATOR));
  if(r == -1) throw new BaseXException("Invalid row delimiter: '%'", get(ROW_DELIMITER));
  if(q == -1) throw new BaseXException("Invalid quote character: '%'", get(QUOTE_CHARACTER));
  // the delimiters must be pairwise distinct: report the separator if it is involved in the
  // clash, otherwise the quote character (which then equals the row delimiter)
  if(s == q || r == s || q == r) throw new BaseXException("Duplicate CSV delimiter error: '%'",
      get(s == q || r == s ? SEPARATOR : QUOTE_CHARACTER));
}

/**
 * Assigns an option supplied as XQuery value and validates the CSV delimiters afterwards.
 * @param name name of the option
 * @param value value to be assigned
 * @param info input info
 * @throws QueryException if the separator, row delimiter or quote character is not a single
 *   valid character, or if two of them are identical
 */
@Override
public synchronized void assign(final Item name, final Value value, final InputInfo info)
    throws QueryException {
  super.assign(name, value, info);
  // resolve each delimiter once; -1 indicates an invalid value
  final int s = separator(), r = rowDelimiter(), q = quoteCharacter();
  if(s == -1) throw OPTION_X.get(info, "Invalid separator: '%'", get(SEPARATOR));
  if(r == -1) throw CSV_SINGLECHAR_X_X.get(info, ROW_DELIMITER.name(), get(ROW_DELIMITER));
  if(q == -1) throw CSV_SINGLECHAR_X_X.get(info, QUOTE_CHARACTER.name(), get(QUOTE_CHARACTER));
  // the delimiters must be pairwise distinct: report the separator if it is involved in the
  // clash, otherwise the quote character (which then equals the row delimiter)
  if(s == q || r == s || q == r) throw CSV_DELIMITER_X.get(info,
      get(s == q || r == s ? SEPARATOR : QUOTE_CHARACTER));
}

/**
Expand All @@ -89,9 +111,34 @@ public int separator() {
for(final CsvSep s : CsvSep.values()) {
if(sep.equals(s.toString())) return s.sep;
}
if(sep.length() == 1) {
final char ch = sep.charAt(0);
if(XMLToken.valid(ch)) return ch;
return validate(sep);
}

/**
 * Resolves the configured row delimiter.
 * @return code point of the row delimiter, or {@code -1} if the option value is not a single
 *   valid character
 */
public int rowDelimiter() {
  final String delimiter = get(ROW_DELIMITER);
  return validate(delimiter);
}

/**
 * Resolves the configured quote character.
 * @return code point of the quote character, or {@code -1} if the option value is not a single
 *   valid character
 */
public int quoteCharacter() {
  final String quote = get(QUOTE_CHARACTER);
  return validate(quote);
}

/**
 * Checks that a string consists of exactly one code point that is accepted by
 * {@code XMLToken.valid}.
 * @param single string expected to hold a single character
 * @return the code point, or {@code -1} if the string is empty, holds more than one code point,
 *   or holds a code point that is not valid
 */
private int validate(final String single) {
  final int length = single.length();
  // code points are counted (not chars), so a surrogate pair is treated as one character
  if(single.codePointCount(0, length) != 1) return -1;
  final int codePoint = single.codePointAt(0);
  return XMLToken.valid(codePoint) ? codePoint : -1;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import org.basex.query.*;
import org.basex.query.util.*;
import org.basex.query.value.item.*;
import org.basex.util.*;
import org.basex.util.list.*;

/**
Expand Down Expand Up @@ -66,10 +67,22 @@ protected CsvConverter(final CsvParserOptions copts) {
* @throws IOException I/O exception
*/
public final Item convert(final IO input) throws QueryException, IOException {
// delegate to the two-argument variant, passing null as input info
return convert(input, null);
}

/**
 * Converts the specified input to an XQuery value.
 * @param input input
 * @param ii input info, passed on to the parser (can be {@code null})
 * @return result
 * @throws QueryException query exception
 * @throws IOException I/O exception
 */
public final Item convert(final IO input, final InputInfo ii) throws QueryException, IOException {
  init(input.url());
  try(NewlineInput in = new NewlineInput(input)) {
    nli = in.encoding(copts.get(CsvParserOptions.ENCODING));
    // parse the input exactly once, forwarding the input info to the parser
    new CsvParser(in, copts, this).parse(ii);
  }
  return finish();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,6 @@ protected FNode finish() {
* Finishes a record.
*/
private void finishRecord() {
  // add the pending record exactly once; records without entries are added as well
  if(record != null) root.add(record);
}
}
96 changes: 76 additions & 20 deletions basex-core/src/main/java/org/basex/io/parse/csv/CsvParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,11 @@

import org.basex.build.csv.*;
import org.basex.io.in.*;
import org.basex.query.*;
import org.basex.query.value.item.*;
import org.basex.query.value.type.*;
import org.basex.util.*;
import org.basex.util.list.*;

/**
* A CSV parser generating parse events similar to a SAX XML parser.
Expand All @@ -23,34 +27,63 @@ public final class CsvParser {
private final boolean backslashes;
/** Column separator (see {@link CsvOptions#SEPARATOR}). */
private final int separator;
/** Row delimiter (see {@link CsvOptions#ROW_DELIMITER}). */
private final int rowDelimiter;
/** Quote character (see {@link CsvOptions#QUOTE_CHARACTER}). */
private final int quoteCharacter;
/** Parse quotes (always enabled if strict quoting is requested). */
private final boolean quotes;
/** Trim whitespace (see {@link CsvOptions#TRIM_WHITESPACE}). */
private final boolean trimWhitespace;
/** Trim rows (see {@link CsvOptions#TRIM_ROWS}). */
private final boolean trimRows;
/** Disallow field content outside of quotes. */
private final boolean strictQuoting;
/** Select columns (1-based positions, see {@link CsvOptions#SELECT_COLUMNS}). */
private final int[] selectColumns;

/** First entry of a line (reset to {@code true} after each row delimiter). */
private boolean first = true;
/** Number of fields in first row ({@code -1} until the first row has been completed). */
private int rowSize = -1;
/** Data mode ({@code false} while the header row is parsed). */
private boolean data;
/** Fields of the current row. */
private final TokenList fields = new TokenList();

/**
 * Constructor.
 * @param input input
 * @param opts options
 * @param conv converter
 * @throws QueryException if a selected column position is smaller than 1
 */
public CsvParser(final TextInput input, final CsvParserOptions opts, final CsvConverter conv)
    throws QueryException {
  this.input = input;
  this.conv = conv;
  header = opts.get(CsvOptions.HEADER);
  separator = opts.separator();
  rowDelimiter = opts.rowDelimiter();
  quoteCharacter = opts.quoteCharacter();
  strictQuoting = opts.get(CsvOptions.STRICT_QUOTING);
  // strict quoting implies that quotes are parsed
  quotes = strictQuoting || opts.get(CsvOptions.QUOTES);
  backslashes = opts.get(CsvOptions.BACKSLASHES);
  trimWhitespace = opts.get(CsvOptions.TRIM_WHITESPACE);
  trimRows = opts.get(CsvOptions.TRIM_ROWS);
  selectColumns = opts.get(CsvOptions.SELECT_COLUMNS);
  // column positions are 1-based: reject anything smaller
  for(final int sc : selectColumns) {
    if(sc < 1) throw QueryError.typeError(Int.get(sc), SeqType.POSITIVE_INTEGER_O, null);
  }
}

/**
* Parses a CSV expression.
* @param ii input info (can be {@code null})
* @throws QueryException query exception
* @throws IOException I/O exception
*/
public void parse() throws IOException {
public void parse(final InputInfo ii) throws QueryException, IOException {
final TokenBuilder entry = new TokenBuilder();
boolean quoted = false;
data = !header;
Expand All @@ -59,33 +92,41 @@ public void parse() throws IOException {
while(ch != -1) {
if(quoted) {
// quoted state
if(ch == '"') {
if(ch == quoteCharacter) {
ch = input.read();
if(ch != '"') {
if(ch != quoteCharacter) {
quoted = false;
if(strictQuoting && ch != separator && ch != rowDelimiter && ch != -1)
throw QueryError.CSV_QUOTING_X.get(ii,
new String(Character.toChars(quoteCharacter)) + entry
+ new String(Character.toChars(quoteCharacter))
+ new String(Character.toChars(ch)));
continue;
}
if(backslashes) add(entry, '"');
if(backslashes) add(entry, quoteCharacter);
} else if(ch == '\\' && backslashes) {
ch = bs();
}
add(entry, ch);
} else if(ch == '"') {
} else if(ch == quoteCharacter) {
if(quotes && entry.isEmpty()) {
// parse quote
quoted = true;
} else if (strictQuoting) {
throw QueryError.CSV_QUOTING_X.get(ii,
entry + new String(Character.toChars(quoteCharacter)));
} else {
ch = input.read();
if(ch != '"' || backslashes) add(entry, '"');
if(ch != quoteCharacter || backslashes) add(entry, quoteCharacter);
continue;
}
} else if(ch == separator) {
// parse separator
record(entry, true);
record(entry, false, false);
first = false;
} else if(ch == '\n') {
} else if(ch == rowDelimiter) {
// parse newline
record(entry, !entry.isEmpty());
record(entry, false, true);
first = true;
data = true;
} else {
Expand All @@ -94,7 +135,9 @@ public void parse() throws IOException {
}
ch = input.read();
}
record(entry, !entry.isEmpty());
if(quoted && strictQuoting)
throw QueryError.CSV_QUOTING_X.get(ii, new String(Character.toChars(quoteCharacter)) + entry);
record(entry, true, true);
}

/**
Expand Down Expand Up @@ -122,17 +165,30 @@ private static void add(final TokenBuilder entry, final int ch) {
/**
* Adds a new record and entry.
* @param entry entry to be added
* @param record add new record
* @param lastRow whether this is the last row
* @param lastField whether this is the last field of the row
* @throws IOException I/O exception
*/
private void record(final TokenBuilder entry, final boolean record) throws IOException {
if(record && first && data) conv.record();
if(record || !first) {
if(data) {
conv.entry(entry.next());
} else {
conv.header(entry.next());
private void record(final TokenBuilder entry, final boolean lastRow, final boolean lastField)
throws IOException {
final byte[] next = entry.next();
final byte[] field = trimWhitespace ? Token.trim(next) : next;
if(field.length > 0 || !(first && lastField)) fields.add(field);
if(lastField && !(lastRow && fields.isEmpty())) {
if(data) conv.record();
if(rowSize == -1) rowSize = fields.size();
final int n = selectColumns.length != 0 ? selectColumns.length
: trimRows ? rowSize : fields.size();
for(int i = 0; i < n; ++i) {
final int index = selectColumns.length != 0 ? selectColumns[i] - 1 : i;
final byte[] f = index < fields.size() ? fields.get(index) : Token.EMPTY;
if(data) {
conv.entry(f);
} else {
conv.header(f);
}
}
fields.reset();
}
}
}
Loading