-
Notifications
You must be signed in to change notification settings - Fork 73
/
Copy pathenumerate.rs
312 lines (273 loc) · 12 KB
/
enumerate.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
/// Help/usage text for the `enum` command. `run` hands it to `util::get_args` together with
/// the raw argv to populate `Args`, so the `Usage:` and option sections must stay in sync
/// with the `arg_*`/`flag_*` fields of that struct.
static USAGE: &str = r#"
Add a new column enumerating the lines of a CSV file. This can be useful to keep
track of a specific line order, give a unique identifier to each line or even
make a copy of the contents of a column.
The enum function has six modes of operation:
1. INCREMENT. Add an incremental identifier to each of the lines:
$ qsv enum file.csv
2. UUID4. Add a uuid v4 to each of the lines:
$ qsv enum --uuid4 file.csv
3. UUID7. Add a uuid v7 to each of the lines:
$ qsv enum --uuid7 file.csv
4. CONSTANT. Create a new column filled with a given value:
$ qsv enum --constant 0
5. COPY. Copy the contents of a column to a new one:
$ qsv enum --copy names
6. HASH. Create a new column with the deterministic hash of the given column/s.
The hash uses the xxHash algorithm and is platform-agnostic.
(see https://github.com/DoumanAsh/xxhash-rust for more information):
$ qsv enum --hash 1- // hash all columns, auto-ignores existing "hash" column
$ qsv enum --hash col2,col3,col4 // hash specific columns
$ qsv enum --hash col2 // hash a single column
$ qsv enum --hash /record_id|name|address/ // hash columns that match a regex
$ qsv enum --hash !/record_id/ // hash all columns except the record_id column
Finally, you should also be able to shuffle the lines of a CSV file by sorting
on the generated uuid4s:
$ qsv enum --uuid4 file.csv | qsv sort -s uuid4 > shuffled.csv
This will shuffle the lines of the file.csv file as uuids generated using the v4
specification are random and for practical purposes, are unique (1 in 2^122).
See https://en.wikipedia.org/wiki/Universally_unique_identifier#Collisions
However, sorting on uuid7 identifiers will not work as they are time-based
and monotonically increasing, and will not shuffle the lines.
Usage:
qsv enum [options] [<input>]
qsv enum --help
enum options:
-c, --new-column <name> Name of the column to create.
Will default to "index".
--start <value> The value to start the enumeration from.
Only applies in Increment mode.
(default: 0)
--increment <value> The value to increment the enumeration by.
Only applies in Increment mode.
(default: 1)
--constant <value> Fill a new column with the given value.
Changes the default column name to "constant" unless
overridden by --new-column.
To specify a null value, pass the literal "<NULL>".
--copy <column> Name of a column to copy.
Changes the default column name to "{column}_copy"
unless overridden by --new-column.
--uuid4 When set, the column will be populated with
uuids (v4) instead of the incremental identifier.
Changes the default column name to "uuid4" unless
overridden by --new-column.
--uuid7 When set, the column will be populated with
uuids (v7) instead of the incremental identifier.
uuid v7 is a time-based uuid and is monotonically increasing.
See https://buildkite.com/blog/goodbye-integers-hello-uuids
Changes the default column name to "uuid7" unless
overridden by --new-column.
--hash <columns> Create a new column filled with the hash of the
given column/s. Use "1-" to hash all columns.
Changes the default column name to "hash" unless
overridden by --new-column.
Will remove an existing "hash" column if it exists.
The <columns> argument specify the columns to use
in the hash. Columns can be referenced by name or index,
starting at 1. Specify multiple columns by separating
them with a comma. Specify a range of columns with `-`.
(See 'qsv select --help' for the full syntax.)
Common options:
-h, --help Display this message
-o, --output <file> Write output to <file> instead of stdout.
-n, --no-headers When set, the first row will not be interpreted
as headers.
-d, --delimiter <arg> The field delimiter for reading CSV data.
Must be a single character. (default: ,)
"#;
use serde::Deserialize;
use uuid::Uuid;
use xxhash_rust::xxh3::xxh3_64;
use crate::{
config::{Config, Delimiter},
select::SelectColumns,
util, CliResult,
};
/// Literal that, when passed to `--constant`, makes `run` write an empty field
/// instead of the literal text.
const NULL_VALUE: &str = "<NULL>";
/// Parsed command-line arguments for `qsv enum`, filled in by `util::get_args` from `USAGE`.
/// Field names mirror the positional argument (`arg_*`) and the options (`flag_*`) in `USAGE`.
#[derive(Deserialize)]
struct Args {
// `<input>`: handed to `Config::new` for the reader; optional.
arg_input: Option<String>,
// `--new-column`: explicit name for the added column; when absent a per-mode default is used.
flag_new_column: Option<String>,
// `--start`: initial counter value in Increment mode.
flag_start: u64,
// `--increment`: counter step in Increment mode; `run` falls back to 1 when absent.
flag_increment: Option<u64>,
// `--constant`: value written to every row; equal to `NULL_VALUE` means an empty field.
flag_constant: Option<String>,
// `--copy`: column selection; `run` copies the first selected column.
flag_copy: Option<SelectColumns>,
// `--uuid4`: fill the new column with v4 UUIDs.
flag_uuid4: bool,
// `--uuid7`: fill the new column with v7 UUIDs.
flag_uuid7: bool,
// `--hash`: column selection whose field values are concatenated and hashed.
flag_hash: Option<SelectColumns>,
// `--output`: handed to `Config::new` for the writer; optional.
flag_output: Option<String>,
// `--no-headers`: forwarded to the reader config; no header row is written when it is in effect.
flag_no_headers: bool,
// `--delimiter`: forwarded to the reader config.
flag_delimiter: Option<Delimiter>,
}
/// The kind of value `run` appends to each record. Exactly one mode is active per invocation;
/// `run` picks it from the flags in the order Constant, Uuid4, Uuid7, Copy, Hash and falls
/// back to Increment when none of those flags is set.
#[derive(PartialEq)]
enum EnumOperation {
// Running counter starting at `--start` and advancing by `--increment`.
Increment,
// A freshly generated v4 UUID per row.
Uuid4,
// A freshly generated v7 UUID per row.
Uuid7,
// The same `--constant` value on every row.
Constant,
// A copy of the column selected with `--copy`.
Copy,
// xxh3 64-bit hash of the columns selected with `--hash`.
Hash,
}
pub fn run(argv: &[&str]) -> CliResult<()> {
let args: Args = util::get_args(USAGE, argv)?;
let mut rconfig = Config::new(args.arg_input.as_ref())
.delimiter(args.flag_delimiter)
.no_headers(args.flag_no_headers);
let mut rdr = rconfig.reader()?;
let mut wtr = Config::new(args.flag_output.as_ref()).writer()?;
let mut headers = rdr.byte_headers()?.clone();
let mut hash_index = None;
let mut copy_index = 0;
let mut copy_operation = false;
if let Some(column_name) = args.flag_copy {
rconfig = rconfig.select(column_name);
let sel = rconfig.selection(&headers)?;
copy_index = *sel.iter().next().unwrap();
copy_operation = true;
}
let mut hash_sel = None;
if let Some(hash_columns) = &args.flag_hash {
// get the index of the column named "hash", if it exists
hash_index = headers.iter().position(|col| col == b"hash");
// get the original selection
rconfig = rconfig.select(hash_columns.clone());
let original_selection = rconfig
.clone()
.select(hash_columns.clone())
.selection(&headers)?;
// Filter out the "hash" column from the original selection, if it exists
let filtered_selection = original_selection
.iter()
.filter(|&&index| index != hash_index.unwrap_or(usize::MAX))
.collect::<Vec<_>>();
// Construct selection string without "hash" column
let selection_string = filtered_selection
.iter()
.map(|&&index| (index + 1).to_string())
.collect::<Vec<String>>()
.join(",");
// Parse the new selection without "hash" column
let no_hash_column_selection = SelectColumns::parse(&selection_string)?;
// Update the configuration with the new selection
rconfig = rconfig.select(no_hash_column_selection);
hash_sel = Some(rconfig.selection(&headers)?);
}
let constant_value = if args.flag_constant == Some(NULL_VALUE.to_string()) {
b""
} else {
args.flag_constant.as_deref().unwrap_or("").as_bytes()
};
let enum_operation = if args.flag_constant.is_some() {
EnumOperation::Constant
} else if args.flag_uuid4 {
EnumOperation::Uuid4
} else if args.flag_uuid7 {
EnumOperation::Uuid7
} else if copy_operation {
EnumOperation::Copy
} else if args.flag_hash.is_some() {
EnumOperation::Hash
} else {
EnumOperation::Increment
};
if !rconfig.no_headers {
if enum_operation == EnumOperation::Hash {
// Remove an existing "hash" column from the header, if it exists
headers = if let Some(hash_index) = hash_index {
headers
.into_iter()
.enumerate()
.filter_map(|(i, field)| if i == hash_index { None } else { Some(field) })
.collect()
} else {
headers
};
}
let column_name = if let Some(new_column_name) = args.flag_new_column {
new_column_name
} else {
match enum_operation {
EnumOperation::Increment => "index".to_string(),
EnumOperation::Uuid4 => "uuid4".to_string(),
EnumOperation::Uuid7 => "uuid7".to_string(),
EnumOperation::Constant => "constant".to_string(),
EnumOperation::Copy => {
let current_header = match simdutf8::compat::from_utf8(&headers[copy_index]) {
Ok(s) => s,
Err(e) => return fail_clierror!("Could not parse header as utf-8!: {e}"),
};
format!("{current_header}_copy")
},
EnumOperation::Hash => "hash".to_string(),
}
};
headers.push_field(column_name.as_bytes());
wtr.write_byte_record(&headers)?;
}
// amortize allocations
let mut record = csv::ByteRecord::new();
let mut counter: u64 = args.flag_start;
#[allow(unused_assignments)]
let mut colcopy: Vec<u8> = Vec::with_capacity(20);
let increment = args.flag_increment.unwrap_or(1);
let mut hash_string = String::new();
let mut hash;
let uuid7_ctxt = uuid::ContextV7::new();
let mut uuid;
while rdr.read_byte_record(&mut record)? {
match enum_operation {
EnumOperation::Increment => {
record.push_field(itoa::Buffer::new().format(counter).as_bytes());
counter += increment;
},
EnumOperation::Uuid4 => {
uuid = Uuid::new_v4();
record.push_field(
uuid.as_hyphenated()
.encode_lower(&mut Uuid::encode_buffer())
.as_bytes(),
);
},
EnumOperation::Uuid7 => {
uuid = Uuid::new_v7(uuid::Timestamp::now(&uuid7_ctxt));
record.push_field(
uuid.as_hyphenated()
.encode_lower(&mut Uuid::encode_buffer())
.as_bytes(),
);
},
EnumOperation::Constant => {
record.push_field(constant_value);
},
EnumOperation::Copy => {
colcopy = record[copy_index].to_vec();
record.push_field(&colcopy);
},
EnumOperation::Hash => {
hash_string.clear();
// build the hash string from the filtered selection
if let Some(ref sel) = hash_sel {
sel.iter().for_each(|i| {
hash_string
.push_str(simdutf8::basic::from_utf8(&record[*i]).unwrap_or_default());
});
}
hash = xxh3_64(hash_string.as_bytes());
// Optionally remove the "hash" column if it already exists from the output
record = if let Some(hash_index) = hash_index {
record
.into_iter()
.enumerate()
.filter_map(|(i, field)| if i == hash_index { None } else { Some(field) })
.collect()
} else {
record
};
record.push_field(hash.to_string().as_bytes());
},
}
wtr.write_byte_record(&record)?;
}
Ok(wtr.flush()?)
}