Skip to content

Commit

Permalink
Replace parquet.js with hyparquet
Browse files Browse the repository at this point in the history
  • Loading branch information
platypii committed Dec 9, 2024
1 parent 89c726d commit b8bfd1c
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 108 deletions.
4 changes: 2 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -26,15 +26,15 @@
"dependencies": {
"@types/node": "^18.11.18",
"@types/node-fetch": "^2.6.4",
"@types/parquetjs": "^0.10.6",
"@types/progress-stream": "^2.0.5",
"abort-controller": "^3.0.0",
"agentkeepalive": "^4.2.1",
"axios": "^1.7.7",
"form-data-encoder": "1.7.2",
"formdata-node": "^4.3.2",
"hyparquet": "^1.6.3",
"hyparquet-compressors": "^0.1.4",
"node-fetch": "^2.6.7",
"parquetjs": "^0.11.2",
"progress-stream": "^2.0.0"
},
"devDependencies": {
Expand Down
14 changes: 5 additions & 9 deletions src/lib/upload.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,7 @@ import fetch from 'node-fetch';
import * as path from 'path';
import progress from 'progress-stream';
import readline from 'readline';
import pkg from 'parquetjs';
const { ParquetReader } = pkg;
import { asyncBufferFromFile, parquetMetadataAsync, SchemaElement } from 'hyparquet';

export interface FileResponse {
id: string;
Expand Down Expand Up @@ -76,11 +75,10 @@ export async function check_file(fileName: string): Promise<CheckFileResponse> {

export async function check_parquet(fileName: string): Promise<string | undefined> {
try {
const reader = await ParquetReader.openFile(fileName);
const cursor = reader.getCursor();
let record = null;
const asyncBuffer = await asyncBufferFromFile(fileName);
const metadata = await parquetMetadataAsync(asyncBuffer);

const fieldNames = Object.keys(reader.schema.fields);
const fieldNames = metadata.schema.map((field: SchemaElement) => field.name);
if (!('input_ids' in fieldNames)) {
return `Parquet file ${fileName} does not contain the 'input_ids' column.`;
}
Expand All @@ -93,12 +91,10 @@ export async function check_parquet(fileName: string): Promise<string | undefine
}
}

const numRows = reader.getRowCount() as unknown as number;
const numRows = metadata.num_rows;
if (numRows < MIN_SAMPLES) {
return `Parquet file ${fileName} contains only ${numRows} samples. Minimum of ${MIN_SAMPLES} samples are required`;
}

await reader.close();
} catch (err) {
return `failed to read parquet file ${fileName}`;
}
Expand Down
120 changes: 23 additions & 97 deletions yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -859,13 +859,6 @@
"@types/node" "*"
form-data "^4.0.0"

"@types/node-int64@*":
version "0.4.32"
resolved "https://registry.yarnpkg.com/@types/node-int64/-/node-int64-0.4.32.tgz#a540bcb9e48816ca1b5329d1ab907d6ad134b856"
integrity sha512-xf/JsSlnXQ+mzvc0IpXemcrO4BrCfpgNpMco+GLcXkFk01k/gW9lGJu+Vof0ZSvHK6DsHJDPSbjFPs36QkWXqw==
dependencies:
"@types/node" "*"

"@types/node@*":
version "20.10.5"
resolved "https://registry.yarnpkg.com/@types/node/-/node-20.10.5.tgz#47ad460b514096b7ed63a1dae26fad0914ed3ab2"
Expand All @@ -878,13 +871,6 @@
resolved "https://registry.yarnpkg.com/@types/node/-/node-18.11.18.tgz#8dfb97f0da23c2293e554c5a50d61ef134d7697f"
integrity sha512-DHQpWGjyQKSHj3ebjFI/wRKcqQcdR+MoFBygntYOZytCqNfkd2ZC4ARDJ2DQqhjH5p85Nnd3jhUJIXrszFX/JA==

"@types/parquetjs@^0.10.6":
version "0.10.6"
resolved "https://registry.yarnpkg.com/@types/parquetjs/-/parquetjs-0.10.6.tgz#7e4b54d9d336a8dda9c7a9091ec7f60db98744af"
integrity sha512-ZCsD6j97YD0mGU8/VnVs3NjORXa7zeHvqlpJpCqy4jU8a1O21dalL+MFn9QNbdEfy8rszR1N7NHeT7/LdtHf+A==
dependencies:
"@types/node-int64" "*"

"@types/progress-stream@^2.0.5":
version "2.0.5"
resolved "https://registry.yarnpkg.com/@types/progress-stream/-/progress-stream-2.0.5.tgz#50f10be88b0717c8fce6573e7fcafa8eabbc3dcf"
Expand Down Expand Up @@ -1194,21 +1180,11 @@ balanced-match@^1.0.0:
resolved "https://registry.yarnpkg.com/balanced-match/-/balanced-match-1.0.2.tgz#e83e3a7e3f300b34cb9d87f615fa0cbf357690ee"
integrity sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw==

base64-js@^1.1.2:
version "1.5.1"
resolved "https://registry.yarnpkg.com/base64-js/-/base64-js-1.5.1.tgz#1b1b440160a5bf7ad40b650f095963481903930a"
integrity sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==

big-integer@^1.6.44:
version "1.6.52"
resolved "https://registry.yarnpkg.com/big-integer/-/big-integer-1.6.52.tgz#60a887f3047614a8e1bffe5d7173490a97dc8c85"
integrity sha512-QxD8cf2eVqJOOz63z6JIN9BzvVs/dlySa5HGSBH5xtR8dPteIRQnBxxKqkNTiT6jbDTF6jAfrd4oMcND9RGbQg==

bindings@~1.2.1:
version "1.2.1"
resolved "https://registry.yarnpkg.com/bindings/-/bindings-1.2.1.tgz#14ad6113812d2d37d72e67b4cacb4bb726505f11"
integrity sha512-u4cBQNepWxYA55FunZSM7wMi55yQaN0otnhhilNoWHq0MfOfJeQx0v0mRRpolGOExPjZcl6FtB0BB8Xkb88F0g==

bplist-parser@^0.2.0:
version "0.2.0"
resolved "https://registry.yarnpkg.com/bplist-parser/-/bplist-parser-0.2.0.tgz#43a9d183e5bf9d545200ceac3e712f79ebbe8d0e"
Expand Down Expand Up @@ -1238,13 +1214,6 @@ braces@^3.0.3:
dependencies:
fill-range "^7.1.1"

brotli@^1.3.0:
version "1.3.3"
resolved "https://registry.yarnpkg.com/brotli/-/brotli-1.3.3.tgz#7365d8cc00f12cf765d2b2c898716bcf4b604d48"
integrity sha512-oTKjJdShmDuGW94SyyaoQvAjf30dZaHnjJ8uAF+u2/vGJkJbJPJAT1gDiOJP5v1Zb6f9KEyW/1HpuaWIXtGHPg==
dependencies:
base64-js "^1.1.2"

browserslist@^4.22.2:
version "4.22.2"
resolved "https://registry.yarnpkg.com/browserslist/-/browserslist-4.22.2.tgz#704c4943072bd81ea18997f3bd2180e89c77874b"
Expand All @@ -1269,11 +1238,6 @@ bser@2.1.1:
dependencies:
node-int64 "^0.4.0"

bson@^1.0.4:
version "1.1.6"
resolved "https://registry.yarnpkg.com/bson/-/bson-1.1.6.tgz#fb819be9a60cd677e0853aee4ca712a785d6618a"
integrity sha512-EvVNVeGo4tHxwi8L6bPj3y3itEvStdwvvlojVxxbyYfoaxJ6keLgrTuKdyfEAszFK+H3olzBuafE0yoh0D1gdg==

buffer-from@^1.0.0:
version "1.1.2"
resolved "https://registry.yarnpkg.com/buffer-from/-/buffer-from-1.1.2.tgz#2b146a6fd72e80b4f55d255f35ed59a3a9a41bd5"
Expand Down Expand Up @@ -1874,6 +1838,11 @@ function-bind@^1.1.2:
resolved "https://registry.yarnpkg.com/function-bind/-/function-bind-1.1.2.tgz#2c02d864d97f3ea6c8830c464cbd11ab6eab7a1c"
integrity sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA==

fzstd@0.1.1:
version "0.1.1"
resolved "https://registry.yarnpkg.com/fzstd/-/fzstd-0.1.1.tgz#a3da29f2fff45070ca90073f866d97e0c56a4a52"
integrity sha512-dkuVSOKKwh3eas5VkJy1AW1vFpet8TA/fGmVA5krThl8YcOVE/8ZIoEA1+U1vEn5ckxxhLirSdY837azmbaNHA==

gensync@^1.0.0-beta.2:
version "1.0.0-beta.2"
resolved "https://registry.yarnpkg.com/gensync/-/gensync-1.0.0-beta.2.tgz#32a6ee76c3d7f52d46b2b1ae5d93fea8580a25e0"
Expand Down Expand Up @@ -1998,6 +1967,24 @@ humanize-ms@^1.2.1:
dependencies:
ms "^2.0.0"

hyparquet-compressors@^0.1.4:
version "0.1.4"
resolved "https://registry.yarnpkg.com/hyparquet-compressors/-/hyparquet-compressors-0.1.4.tgz#4039ff47e496970a028d8525923ed08bced342bf"
integrity sha512-41Twmk9mk/EsRYg+xR9/i/ZZFlES2Rjzb24ROklewEdwYLIhMuFzjfayCJYAdOiv+DM0bf1Xo+UQO3ZgPFJeEg==
dependencies:
fzstd "0.1.1"
hysnappy "0.3.1"

hyparquet@^1.6.3:
version "1.6.3"
resolved "https://registry.yarnpkg.com/hyparquet/-/hyparquet-1.6.3.tgz#8ddc84c0023cc00bc5ef33246df48ba23b36c38d"
integrity sha512-JwD3bcRueKs7/0iQG8xyJxh6OAXSQdzh0dJkMtzQPSDOTuvoKniDgdOPB9oXGspbUhp5c9uW6coqYiFddKJ+cw==

hysnappy@0.3.1:
version "0.3.1"
resolved "https://registry.yarnpkg.com/hysnappy/-/hysnappy-0.3.1.tgz#dd823a4b1de7980f6deac7cc77d28c061605872e"
integrity sha512-Hv2cv3pzoqxFVaTOnSoTMoXAd1WhWwlkvxMmr1cUlOXSs8hF03vsYZPXPkrTFSBF7abDkfZZtYcbz8ZBglsRYw==

iconv-lite@^0.6.3:
version "0.6.3"
resolved "https://registry.yarnpkg.com/iconv-lite/-/iconv-lite-0.6.3.tgz#a52f80bf38da1952eb5c681790719871a1a72501"
Expand Down Expand Up @@ -2049,11 +2036,6 @@ inherits@2, inherits@^2.0.3, inherits@~2.0.3:
resolved "https://registry.yarnpkg.com/inherits/-/inherits-2.0.4.tgz#0fa2c64f932917c3433a0ded55363aae37416b7c"
integrity sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==

int53@^0.2.4:
version "0.2.4"
resolved "https://registry.yarnpkg.com/int53/-/int53-0.2.4.tgz#5ed8d7aad6c5c6567cae69aa7ffc4a109ee80f86"
integrity sha512-a5jlKftS7HUOhkUyYD7j2sJ/ZnvWiNlZS1ldR+g1ifQ+/UuZXIE+YTc/lK1qGj/GwAU5F8Z0e1eVq2t1J5Ob2g==

is-arrayish@^0.2.1:
version "0.2.1"
resolved "https://registry.yarnpkg.com/is-arrayish/-/is-arrayish-0.2.1.tgz#77c99840527aa8ecb1a8ba697b80645a7a926a9d"
Expand Down Expand Up @@ -2664,13 +2646,6 @@ lru-cache@^6.0.0:
dependencies:
yallist "^4.0.0"

lzo@^0.4.0:
version "0.4.11"
resolved "https://registry.yarnpkg.com/lzo/-/lzo-0.4.11.tgz#0e76d582567b29e285cb84a6aa392cb94c6283f8"
integrity sha512-apQHNoW2Alg72FMqaC/7pn03I7umdgSVFt2KRkCXXils4Z9u3QBh1uOtl2O5WmZIDLd9g6Lu4lIdOLmiSTFVCQ==
dependencies:
bindings "~1.2.1"

make-dir@^4.0.0:
version "4.0.0"
resolved "https://registry.yarnpkg.com/make-dir/-/make-dir-4.0.0.tgz#c3c2307a771277cd9638305f915c29ae741b614e"
Expand Down Expand Up @@ -2805,11 +2780,6 @@ npm-run-path@^5.1.0:
dependencies:
path-key "^4.0.0"

object-stream@0.0.1:
version "0.0.1"
resolved "https://registry.yarnpkg.com/object-stream/-/object-stream-0.0.1.tgz#3a03a26e94fd112c9abffeb4651e07a5e23cf840"
integrity sha512-+NPJnRvX9RDMRY9mOWOo/NDppBjbZhXirNNSu2IBnuNboClC9h1ZGHXgHBLDbJMHsxeJDq922aVmG5xs24a/cA==

once@^1.3.0:
version "1.4.0"
resolved "https://registry.yarnpkg.com/once/-/once-1.4.0.tgz#583b1aa775961d4b113ac17d9c50baef9dd76bd1"
Expand Down Expand Up @@ -2907,21 +2877,6 @@ parent-module@^1.0.0:
dependencies:
callsites "^3.0.0"

parquetjs@^0.11.2:
version "0.11.2"
resolved "https://registry.yarnpkg.com/parquetjs/-/parquetjs-0.11.2.tgz#ea13221b3583cb1277f8b4b879776420f8863660"
integrity sha512-Y6FOc3Oi2AxY4TzJPz7fhICCR8tQNL3p+2xGQoUAMbmlJBR7+JJmMrwuyMjIpDiM7G8Wj/8oqOH4UDUmu4I5ZA==
dependencies:
brotli "^1.3.0"
bson "^1.0.4"
int53 "^0.2.4"
object-stream "0.0.1"
snappyjs "^0.6.0"
thrift "^0.11.0"
varint "^5.0.0"
optionalDependencies:
lzo "^0.4.0"

parse-json@^5.2.0:
version "5.2.0"
resolved "https://registry.yarnpkg.com/parse-json/-/parse-json-5.2.0.tgz#c76fc66dee54231c962b22bcc8a72cf2f99753cd"
Expand Down Expand Up @@ -3046,11 +3001,6 @@ pure-rand@^6.0.0:
resolved "https://registry.yarnpkg.com/pure-rand/-/pure-rand-6.0.4.tgz#50b737f6a925468679bff00ad20eade53f37d5c7"
integrity sha512-LA0Y9kxMYv47GIPJy6MI84fqTd2HmYZI83W/kM/SkKfDlajnZYfmXFTxkbY+xSBPkLJxltMa9hIkmdc29eguMA==

q@^1.5.0:
version "1.5.1"
resolved "https://registry.yarnpkg.com/q/-/q-1.5.1.tgz#7e32f75b41381291d04611f1bf14109ac00651d7"
integrity sha512-kV/CThkXo6xyFEZUugw/+pIOywXcDbFYgSct5cT3gqlbkBE1SJdwy6UQoZvodiWF/ckQLZyDE/Bu1M6gVu5lVw==

queue-microtask@^1.2.2:
version "1.2.3"
resolved "https://registry.yarnpkg.com/queue-microtask/-/queue-microtask-1.2.3.tgz#4929228bbc724dfac43e0efb058caf7b6cfb6243"
Expand Down Expand Up @@ -3204,11 +3154,6 @@ slash@^3.0.0:
resolved "https://registry.yarnpkg.com/slash/-/slash-3.0.0.tgz#6539be870c165adbd5240220dbe361f1bc4d4634"
integrity sha512-g9Q1haeby36OSStwb4ntCGGGaKsaVSjQ68fBxoQcutl5fS1vuY18H3wSt3jFyFtrkx+Kz0V1G85A4MyAdDMi2Q==

snappyjs@^0.6.0:
version "0.6.1"
resolved "https://registry.yarnpkg.com/snappyjs/-/snappyjs-0.6.1.tgz#9bca9ff8c54b133a9cc84a71d22779e97fc51878"
integrity sha512-YIK6I2lsH072UE0aOFxxY1dPDCS43I5ktqHpeAsuLNYWkE5pGxRGWfDM4/vSUfNzXjC1Ivzt3qx31PCLmc9yqg==

source-map-support@0.5.13:
version "0.5.13"
resolved "https://registry.yarnpkg.com/source-map-support/-/source-map-support-0.5.13.tgz#31b24a9c2e73c2de85066c0feb7d44767ed52932"
Expand Down Expand Up @@ -3362,15 +3307,6 @@ text-table@^0.2.0:
resolved "https://registry.yarnpkg.com/text-table/-/text-table-0.2.0.tgz#7f5ee823ae805207c00af2df4a84ec3fcfa570b4"
integrity sha512-N+8UisAXDGk8PFXP4HAzVR9nbfmVJ3zYLAWiTIoqC5v5isinhr+r5uaO8+7r3BMfuNIufIsA7RdpVgacC2cSpw==

thrift@^0.11.0:
version "0.11.0"
resolved "https://registry.yarnpkg.com/thrift/-/thrift-0.11.0.tgz#256115e4ff87871e12537f4b510bd2b425e13990"
integrity sha512-UpsBhOC45a45TpeHOXE4wwYwL8uD2apbHTbtBvkwtUU4dNwCjC7DpQTjw2Q6eIdfNtw+dKthdwq94uLXTJPfFw==
dependencies:
node-int64 "^0.4.0"
q "^1.5.0"
ws ">= 2.2.3"

through2@~2.0.3:
version "2.0.5"
resolved "https://registry.yarnpkg.com/through2/-/through2-2.0.5.tgz#01c1e39eb31d07cb7d03a96a70823260b23132cd"
Expand Down Expand Up @@ -3550,11 +3486,6 @@ v8-to-istanbul@^9.0.1:
"@types/istanbul-lib-coverage" "^2.0.1"
convert-source-map "^2.0.0"

varint@^5.0.0:
version "5.0.2"
resolved "https://registry.yarnpkg.com/varint/-/varint-5.0.2.tgz#5b47f8a947eb668b848e034dcfa87d0ff8a7f7a4"
integrity sha512-lKxKYG6H03yCZUpAGOPOsMcGxd1RHCu1iKvEHYDPmTyq2HueGhD73ssNBqqQWfvYs04G9iUFRvmAVLW20Jw6ow==

walker@^1.0.8:
version "1.0.8"
resolved "https://registry.yarnpkg.com/walker/-/walker-1.0.8.tgz#bd498db477afe573dc04185f011d3ab8a8d7653f"
Expand Down Expand Up @@ -3609,11 +3540,6 @@ write-file-atomic@^4.0.2:
imurmurhash "^0.1.4"
signal-exit "^3.0.7"

"ws@>= 2.2.3":
version "8.18.0"
resolved "https://registry.yarnpkg.com/ws/-/ws-8.18.0.tgz#0d7505a6eafe2b0e712d232b42279f53bc289bbc"
integrity sha512-8VbfWfHLbbwu3+N6OKsOMpBdT4kXPDDB9cJk2bJ6mh9ucxdlnNvH1e+roYkKmN9Nxw2yjz7VzeO9oOz2zJ04Pw==

xtend@~4.0.1:
version "4.0.2"
resolved "https://registry.yarnpkg.com/xtend/-/xtend-4.0.2.tgz#bb72779f5fa465186b1f438f674fa347fdb5db54"
Expand Down

0 comments on commit b8bfd1c

Please sign in to comment.