296
297
298
299
@@ -1826,7 +2122,15 @@
337
338
339
-340 | def filter_non_primitive(T, expressions, filter_type="all", tqdm=_tqdm):
+340
+341
+342
+343
+344
+345
+346
+347
+348
| def filter_non_primitive(T, expressions, filter_type="all", tqdm=_tqdm):
"""
OBSOLETE
filters table
@@ -1875,7 +2179,7 @@
else:
raise TypeError
# create new tables
- res = _compress_both(T, mask, pbar=pbar)
+ res = compress_both(T, mask, pbar=pbar)
pbar.update(pbar.total - pbar.n)
return res
@@ -2030,15 +2334,7 @@
Source code in tablite/redux.py
- 342
-343
-344
-345
-346
-347
-348
-349
-350
+ 350
351
352
353
@@ -2080,7 +2376,15 @@
389
390
391
-392 | def filter(T, expressions, filter_type="all", tqdm=_tqdm):
+392
+393
+394
+395
+396
+397
+398
+399
+400
| def filter(T, expressions, filter_type="all", tqdm=_tqdm):
"""filters table
Note: At the moment only tablite primitive types are supported
@@ -2122,7 +2426,7 @@
# TODO: make parser for expressions and use the nim implement
mask = _filter_using_expression(T, expressions)
pbar.update(10)
- res = _compress_both(T, mask, pbar=pbar)
+ res = compress_both(T, mask, pbar=pbar)
pbar.update(pbar.total - pbar.n)
elif isinstance(expressions, list):
return _filter_using_list_of_dicts_native(T, expressions, filter_type, tqdm)
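Note: the patched filter above accepts two input styles, an expression string (routed through _filter_using_expression) or a list of criteria dicts (routed through _filter_using_list_of_dicts_native). A hedged sketch of both call styles, assuming the two-table (true, false) return shown in the feature overview below; the table contents are illustrative only:

    from tablite import Table

    t = Table({'A': [1, 2, 3, 4], 'B': [10, 20, 30, 40]})

    # expression string: any expression python's compiler accepts (changelog 2022.10.11)
    true, false = t.filter("A >= 3")

    # list of criteria dicts, combined with filter_type 'all' or 'any'
    true, false = t.filter(
        [{"column1": "A", "criteria": ">=", "value2": 3}],
        filter_type="all",
    )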
diff --git a/master/search/search_index.json b/master/search/search_index.json
index 64828110..b3d33d55 100644
--- a/master/search/search_index.json
+++ b/master/search/search_index.json
@@ -1 +1 @@
-{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"Tablite","text":""},{"location":"#contents","title":"Contents","text":" - introduction
- installation
- feature overview
- api
- tutorial
- latest updates
- credits
"},{"location":"#introduction","title":"Introduction","text":"Tablite seeks to be the go-to library for manipulating tabular data with an api that is as close in syntax to pure python as possible. "},{"location":"#even-smaller-memory-footprint","title":"Even smaller memory footprint","text":"Tablite uses numpys fileformat as a backend with strong abstraction, so that copy, append & repetition of data is handled in pages. This is imperative for incremental data processing. Tablite tests for memory footprint. One test compares the memory footprint of 10,000,000 integers where tablite will use < 1 Mb RAM in contrast to python which will require around 133.7 Mb of RAM (1M lists with 10 integers). Tablite also tests to assure that working with 1Tb of data is tolerable. Tablite achieves this minimal memory footprint by using a temporary storage set in config.Config.workdir as tempfile.gettempdir()/tablite-tmp . If your OS (windows/linux/mac) sits on a SSD this will benefit from high IOPS and permit slices of 9,000,000,000 rows in less than a second. "},{"location":"#multiprocessing-enabled-by-default","title":"Multiprocessing enabled by default","text":"Tablite uses numpy whereever possible and applies multiprocessing for bypassing the GIL on all major operations. CSV import is performed in C through using nim s compiler and is as fast the hardware allows. "},{"location":"#all-algorithms-have-been-reworked-to-respect-memory-limits","title":"All algorithms have been reworked to respect memory limits","text":"Tablite respects the limits of free memory by tagging the free memory and defining task size before each memory intensive task is initiated (join, groupby, data import, etc). If you still run out of memory you may try to reduce the config.Config.PAGE_SIZE and rerun your program. "},{"location":"#100-support-for-all-python-datatypes","title":"100% support for all python datatypes","text":"Tablite wants to make it easy for you to work with data. tablite.Table's behave like a dict with lists: my_table[column name] = [... data ...] . Tablite uses datatype mapping to native numpy types where possible and uses type mapping for non-native types such as timedelta, None, date, time\u2026 e.g. what you put in, is what you get out. This is inspired by bank python. "},{"location":"#light-weight","title":"Light weight","text":"Tablite is ~200 kB. "},{"location":"#helpful","title":"Helpful","text":"Tablite wants you to be productive, so a number of helpers are available. Table.import_file to import csv*, tsv, txt, xls, xlsx, xlsm, ods, zip and logs. There is automatic type detection (see tutorial.ipynb ) - To peek into any supported file use
get_headers which shows the first 10 rows. - Use
mytable.rows and mytable.columns to iterate over rows or columns. - Create multi-key
.index for quick lookups. - Perform multi-key
.sort , - Filter using
.any and .all to select specific rows. - Use multi-key
.lookup and .join to find data across tables. - Perform
.groupby and reorganise data as a .pivot table with max, min, sum, first, last, count, unique, average, st.deviation, median and mode - Append / concatenate tables with
+= which automatically sorts out the columns - even if they're not in perfect order. - Should your tables be similar but not identical you can use
.stack to \"stack\" tables on top of each other If you're still missing something add it to the wishlist "},{"location":"#installation","title":"Installation","text":"Get it from pypi: Install: pip install tablite Usage: >>> from tablite import Table "},{"location":"#build-test","title":"Build & test","text":"install nim >= 2.0.0 run: chmod +x ./build_nim.sh run: ./build_nim.sh Should the default nim not be your desired taste, please use nims environment manager (atlas ) and run source nim-2.0.0/activate.sh on UNIX or nim-2.0.0/activate.bat on windows. install python >= 3.8\npython -m venv /your/venv/dir\nactivate /your/venv/dir\npip install -r requirements.txt\npip install -r requirements_for_testing.py\npytest ./tests\n "},{"location":"#feature-overview","title":"Feature overview","text":"want to... this way... loop over rows [ row for row in table.rows ] loop over columns [ table[col_name] for col_name in table.columns ] slice myslice = table['A', 'B', slice(0,None,15)] get column by name my_table['A'] get row by index my_table[9_000_000_001] value update mytable['A'][2] = new value update w. list comprehension mytable['A'] = [ x*x for x in mytable['A'] if x % 2 != 0 ] join a_join = numbers.join(letters, left_keys=['colour'], right_keys=['color'], left_columns=['number'], right_columns=['letter'], kind='left') lookup travel_plan = friends.lookup(bustable, (DataTypes.time(21, 10), \"<=\", 'time'), ('stop', \"==\", 'stop')) groupby group_by = table.groupby(keys=['C', 'B'], functions=[('A', gb.count)]) pivot table my_pivot = t.pivot(rows=['C'], columns=['A'], functions=[('B', gb.sum), ('B', gb.count)], values_as_rows=False) index indices = old_table.index(*old_table.columns) sort lookup1_sorted = lookup_1.sort(**{'time': True, 'name':False, \"sort_mode\":'unix'}) filter true, false = unfiltered.filter( [{\"column1\": 'a', \"criteria\":\">=\", 'value2':3}, ... more criteria ... ], filter_type='all' ) find any any_even_rows = mytable.any('A': lambda x : x%2==0, 'B': lambda x > 0) find all all_even_rows = mytable.all('A': lambda x : x%2==0, 'B': lambda x > 0) to json json_str = my_table.to_json() from json Table.from_json(json_str) "},{"location":"#api","title":"API","text":"To view the detailed API see api "},{"location":"#tutorial","title":"Tutorial","text":"To learn more see the tutorial.ipynb (Jupyter notebook) "},{"location":"#latest-updates","title":"Latest updates","text":"See changelog.md "},{"location":"#credits","title":"Credits","text":" - Eugene Antonov - the api documentation.
- Audrius Kulikajevas - Edge case testing / various bugs, Jupyter notebook integration.
- Ovidijus Grigas - various bugs, documentation.
- Martynas Kaunas - GroupBy functionality.
- Sergej Sinkarenko - various bugs.
- Lori Cooper - spell checking.
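The feature overview above condenses to a few lines of runnable code. A minimal sketch, with column names and data that are illustrative only (the .any predicate style follows the feature overview):

    from tablite import Table

    t = Table({'A': [1, 2, 3, 4], 'B': ['w', 'x', 'y', 'z']})
    rows = [row for row in t.rows]                 # loop over rows
    cols = [t[name] for name in t.columns]         # loop over columns
    t['C'] = [x * x for x in t['A']]               # create/update a column
    evens = t.any(**{'A': lambda x: x % 2 == 0})   # select rows by predicate
    t2 = t + t                                     # append tables with +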
"},{"location":"benchmarks/","title":"Benchmarks","text":"In\u00a0[2]: Copied! import psutil, os, gc, shutil, tempfile\nfrom pathlib import Path\nfrom time import perf_counter, time\nfrom tablite import Table\nfrom tablite.datasets import synthetic_order_data\nfrom tablite.config import Config\n\nConfig.TQDM_DISABLE = True\n import psutil, os, gc, shutil, tempfile from pathlib import Path from time import perf_counter, time from tablite import Table from tablite.datasets import synthetic_order_data from tablite.config import Config Config.TQDM_DISABLE = True In\u00a0[3]: Copied! process = psutil.Process(os.getpid())\n\ndef make_tables(sizes=[1,2,5,10,20,50]):\n # The last tables are too big for RAM (~24Gb), so I create subtables of 1M rows and append them.\n t = synthetic_order_data(Config.PAGE_SIZE)\n real, flat = t.nbytes()\n print(f\"Table {len(t):,} rows is {real/1e6:,.0f} Mb on disk\")\n\n tables = [t] # 1M rows.\n\n last = 1\n t2 = t.copy()\n for i in sizes[1:]:\n t2 = t2.copy()\n for _ in range(i-last):\n t2 += synthetic_order_data(Config.PAGE_SIZE) # these are all unique\n last = i\n real, flat = t2.nbytes()\n tables.append(t2)\n print(f\"Table {len(t2):,} rows is {real/1e6:,.0f} Mb on disk\")\n return tables\n\ntables = make_tables()\n process = psutil.Process(os.getpid()) def make_tables(sizes=[1,2,5,10,20,50]): # The last tables are too big for RAM (~24Gb), so I create subtables of 1M rows and append them. t = synthetic_order_data(Config.PAGE_SIZE) real, flat = t.nbytes() print(f\"Table {len(t):,} rows is {real/1e6:,.0f} Mb on disk\") tables = [t] # 1M rows. last = 1 t2 = t.copy() for i in sizes[1:]: t2 = t2.copy() for _ in range(i-last): t2 += synthetic_order_data(Config.PAGE_SIZE) # these are all unique last = i real, flat = t2.nbytes() tables.append(t2) print(f\"Table {len(t2):,} rows is {real/1e6:,.0f} Mb on disk\") return tables tables = make_tables() Table 1,000,000 rows is 256 Mb on disk\nTable 2,000,000 rows is 512 Mb on disk\nTable 5,000,000 rows is 1,280 Mb on disk\nTable 10,000,000 rows is 2,560 Mb on disk\nTable 20,000,000 rows is 5,120 Mb on disk\nTable 50,000,000 rows is 12,800 Mb on disk\n The values in the tables above are all unique! In\u00a0[4]: Copied! 
tables[-1]\n Out[4]: [rendered preview of the 50,000,000-row table elided: 12 columns (#, 1..11) of synthetic order data - ids, order dates, quantities, product codes and floats] In\u00a0[5]: def save_load_benchmarks(tables):\n tmp = Path(tempfile.gettempdir()) / \"junk\"\n tmp.mkdir(exist_ok=True)\n\n results = Table()\n results.add_columns('rows', 'save (sec)', 'load (sec)')\n for t in tables:\n fn = tmp / f'{len(t)}.tpz'\n start = perf_counter()\n t.save(fn)\n end = perf_counter()\n save = round(end-start,3)\n assert fn.exists()\n\n start = perf_counter()\n t2 = Table.load(fn)\n end = perf_counter()\n load = round(end-start,3)\n print(f\"saving {len(t):,} rows ({fn.stat().st_size/1e6:,.0f} Mb) took {save:,.3f} seconds. loading took {load:,.3f} seconds\")\n del t2\n fn.unlink()\n results.add_rows(len(t), save, load)\n\n r = results\n r['save r/sec'] = [int(a/b) if b!=0 else \"nil\" for a,b in zip(r['rows'], r['save (sec)'])]\n r['load r/sec'] = [int(a/b) if b!=0 else \"nil\" for a,b in zip(r['rows'], r['load (sec)'])]\n\n return results\n In\u00a0[6]: slb = save_load_benchmarks(tables)\n saving 1,000,000 rows (49 Mb) took 2.148 seconds. loading took 0.922 seconds\nsaving 2,000,000 rows (98 Mb) took 4.267 seconds. loading took 1.820 seconds\nsaving 5,000,000 rows (246 Mb) took 10.618 seconds. 
loading took 4.482 seconds\nsaving 10,000,000 rows (492 Mb) took 21.291 seconds. loading took 8.944 seconds\nsaving 20,000,000 rows (984 Mb) took 42.603 seconds. loading took 17.821 seconds\nsaving 50,000,000 rows (2,461 Mb) took 106.644 seconds. loading took 44.600 seconds\n In\u00a0[7]: slb\n Out[7]: # | rows | save (sec) | load (sec) | save r/sec | load r/sec 0 | 1000000 | 2.148 | 0.922 | 465549 | 1084598 1 | 2000000 | 4.267 | 1.82 | 468713 | 1098901 2 | 5000000 | 10.618 | 4.482 | 470898 | 1115573 3 | 10000000 | 21.291 | 8.944 | 469682 | 1118067 4 | 20000000 | 42.603 | 17.821 | 469450 | 1122271 5 | 50000000 | 106.644 | 44.6 | 468849 | 1121076 With various compression options In\u00a0[8]: def save_compression_benchmarks(t):\n tmp = Path(tempfile.gettempdir()) / \"junk\"\n tmp.mkdir(exist_ok=True)\n\n import zipfile # https://docs.python.org/3/library/zipfile.html#zipfile.ZipFile\n methods = [(None, zipfile.ZIP_STORED, \"zip stored\"), (None, zipfile.ZIP_LZMA, \"zip lzma\")]\n methods += [(i, zipfile.ZIP_DEFLATED, \"zip deflated\") for i in range(0,10)]\n methods += [(i, zipfile.ZIP_BZIP2, \"zip bzip2\") for i in range(1,10)]\n\n results = Table()\n results.add_columns('file size (Mb)', 'method', 'write (sec)', 'read (sec)')\n for level, method, name in methods:\n fn = tmp / f'{len(t)}.tpz'\n start = perf_counter()\n t.save(fn, compression_method=method, compression_level=level)\n end = perf_counter()\n write = round(end-start,3)\n assert fn.exists()\n size = int(fn.stat().st_size/1e6)\n # print(f\"{name}(level={level}): {len(t):,} rows ({size} Mb) took {write:,.3f} seconds to save\", end='')\n\n start = perf_counter()\n t2 = Table.load(fn)\n end = perf_counter()\n read = round(end-start,3)\n # print(f\" and {end-start:,.3} seconds to load\")\n print(\".\", end='')\n\n del t2\n fn.unlink()\n results.add_rows(size, f\"{name}(level={level})\", write, read)\n\n r = results\n r.sort({'write (sec)':True})\n r['write (rps)'] = [int(1_000_000/b) for b in r['write (sec)']]\n r['read (rps)'] = [int(1_000_000/b) for b in r['read (sec)']]\n return results\n In\u00a0[9]: scb = save_compression_benchmarks(tables[0])\n ..................... 
creating sort index: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 1/1 [00:00<00:00, 268.92it/s]\n In\u00a0[10]: scb[0:20]\n Out[10]: # | file size (Mb) | method | write (sec) | read (sec) | write (rps) | read (rps) 0 | 256 | zip stored(level=None) | 0.396 | 0.475 | 2525252 | 2105263 1 | 29 | zip lzma(level=None) | 95.137 | 2.228 | 10511 | 448833 2 | 256 | zip deflated(level=0) | 0.535 | 0.595 | 1869158 | 1680672 3 | 49 | zip deflated(level=1) | 2.15 | 0.922 | 465116 | 1084598 4 | 47 | zip deflated(level=2) | 2.264 | 0.912 | 441696 | 1096491 5 | 43 | zip deflated(level=3) | 3.049 | 0.83 | 327976 | 1204819 6 | 44 | zip deflated(level=4) | 2.92 | 0.862 | 342465 | 1160092 7 | 42 | zip deflated(level=5) | 4.034 | 0.869 | 247892 | 1150747 8 | 40 | zip deflated(level=6) | 8.558 | 0.8 | 116849 | 1250000 9 | 39 | zip deflated(level=7) | 13.695 | 0.778 | 73019 | 1285347 10 | 38 | zip deflated(level=8) | 56.972 | 0.792 | 17552 | 1262626 11 | 38 | zip deflated(level=9) | 122.623 | 0.791 | 8155 | 1264222 12 | 29 | zip bzip2(level=1) | 15.121 | 4.065 | 66133 | 246002 13 | 29 | zip bzip2(level=2) | 16.047 | 4.214 | 62316 | 237304 14 | 29 | zip bzip2(level=3) | 16.858 | 4.409 | 59319 | 226808 15 | 29 | zip bzip2(level=4) | 17.648 | 5.141 | 56663 | 194514 16 | 29 | zip bzip2(level=5) | 18.674 | 6.009 | 53550 | 166417 17 | 29 | zip bzip2(level=6) | 19.405 | 6.628 | 51533 | 150875 18 | 29 | zip bzip2(level=7) | 19.954 | 6.714 | 50115 | 148942 19 | 29 | zip bzip2(level=8) | 20.595 | 6.961 | 48555 | 143657 Conclusions - Fastest: zip stored with no compression, at ~2.5M rows/sec write and ~2.1M rows/sec read.
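The compression keywords exercised above are ordinary zipfile constants, so picking a tradeoff is a one-liner. A sketch using deflate level 1, which in the run above kept the file at ~19% of the stored size and appears to match the default settings used in the save/load run at the top of this page (identical timings):

    import zipfile
    from pathlib import Path
    from tablite import Table

    t = Table({'A': list(range(1_000_000))})
    fn = Path('data.tpz')
    t.save(fn, compression_method=zipfile.ZIP_DEFLATED, compression_level=1)
    t2 = Table.load(fn)
    assert t == t2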
In\u00a0[11]: def to_sql_benchmark(t, rows=1_000_000):\n t2 = t[:rows]\n write_start = time()\n _ = t2.to_sql(name='1')\n write_end = time()\n write = round(write_end-write_start,3)\n return ( t.to_sql.__name__, write, 0, len(t2), \"\" , \"\" )\n In\u00a0[12]: def to_json_benchmark(t, rows=1_000_000):\n t2 = t[:rows]\n\n tmp = Path(tempfile.gettempdir()) / \"junk\"\n tmp.mkdir(exist_ok=True)\n path = tmp / \"1.json\"\n\n write_start = time()\n bytestr = t2.to_json()\n with path.open('w') as fo:\n fo.write(bytestr)\n write_end = time()\n write = round(write_end-write_start,3)\n\n read_start = time()\n with path.open('r') as fi:\n _ = Table.from_json(fi.read()) # <-- JSON\n read_end = time()\n read = round(read_end-read_start,3)\n\n return ( t.to_json.__name__, write, read, len(t2), int(path.stat().st_size/1e6), \"\" )\n In\u00a0[13]: def f(t, args):\n rows, c1, c1_kw, c2, c2_kw = args\n t2 = t[:rows]\n\n call = getattr(t2, c1)\n assert callable(call)\n\n write_start = time()\n call(**c1_kw)\n write_end = time()\n write = round(write_end-write_start,3)\n\n for _ in range(10):\n gc.collect()\n\n read_start = time()\n if callable(c2):\n c2(**c2_kw)\n read_end = time()\n read = round(read_end-read_start,3)\n\n fn = c2_kw['path']\n assert fn.exists()\n fs = int(fn.stat().st_size/1e6)\n config = {k:v for k,v in c2_kw.items() if k!= 'path'}\n\n return ( c1, write, read, len(t2), fs , str(config))\n In\u00a0[14]: 
def import_export_benchmarks(tables):\n Config.PROCESSING_MODE = Config.FALSE\n\n t = sorted(tables, key=lambda x: len(x), reverse=True)[0]\n\n tmp = Path(tempfile.gettempdir()) / \"junk\"\n tmp.mkdir(exist_ok=True)\n\n args = [\n ( 100_000, \"to_xlsx\", {'path': tmp/'1.xlsx'}, Table.from_file, {\"path\":tmp/'1.xlsx', \"sheet\":\"pyexcel_sheet1\"}),\n ( 50_000, \"to_ods\", {'path': tmp/'1.ods'}, Table.from_file, {\"path\":tmp/'1.ods', \"sheet\":\"pyexcel_sheet1\"} ), # 50k rows, otherwise MemoryError.\n ( 1_000_000, \"to_csv\", {'path': tmp/'1.csv'}, Table.from_file, {\"path\":tmp/'1.csv'} ),\n ( 1_000_000, \"to_csv\", {'path': tmp/'1.csv'}, Table.from_file, {\"path\":tmp/'1.csv', \"guess_datatypes\":False}),\n (10_000_000, \"to_csv\", {'path': tmp/'1.csv'}, Table.from_file, {\"path\":tmp/'1.csv', \"guess_datatypes\":False}),\n ( 1_000_000, \"to_tsv\", {'path': tmp/'1.tsv'}, Table.from_file, {\"path\":tmp/'1.tsv'} ),\n ( 1_000_000, \"to_text\", {'path': tmp/'1.txt'}, Table.from_file, {\"path\":tmp/'1.txt'} ),\n ( 1_000_000, \"to_html\", {'path': tmp/'1.html'}, Table.from_file, {\"path\":tmp/'1.html'} ),\n ( 1_000_000, \"to_hdf5\", {'path': tmp/'1.hdf5'}, Table.from_file, {\"path\":tmp/'1.hdf5'} )\n ]\n\n results = Table()\n results.add_columns('method', 'write (s)', 'read (s)', 'rows', 'size (Mb)', 'config')\n\n results.add_rows( to_sql_benchmark(t) )\n results.add_rows( to_json_benchmark(t) )\n\n for arg in args:\n if len(t)<arg[0]:\n continue\n print(\".\", end='')\n try:\n results.add_rows( f(t, arg) )\n except MemoryError:\n results.add_rows( arg[1], \"Memory Error\", \"NIL\", arg[0], \"NIL\", \"N/A\")\n\n r = results\n r['read r/sec'] = [int(a/b) if b!=0 else \"nil\" for a,b in zip(r['rows'], r['read (s)']) ]\n r['write r/sec'] = [int(a/b) if b!=0 else \"nil\" for a,b in zip(r['rows'], r['write (s)'])]\n\n shutil.rmtree(tmp)\n return results\n In\u00a0[15]: ieb = import_export_benchmarks(tables)\n .........writing 12,000,000 records to /tmp/junk/1.hdf5... done\n
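Each of the handlers above can also be called directly. A sketch of a single round-trip outside the harness, reusing the keyword signatures from the args list above; file names are illustrative:

    import tempfile
    from pathlib import Path
    from tablite import Table

    tmp = Path(tempfile.gettempdir()) / "junk"
    tmp.mkdir(exist_ok=True)

    t = Table({'A': [1, 2, 3], 'B': ['x', 'y', 'z']})
    t.to_csv(path=tmp / 'demo.csv')
    t2 = Table.from_file(path=tmp / 'demo.csv', guess_datatypes=False)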
In\u00a0[16]: ieb\n Out[16]: # | method | write (s) | read (s) | rows | size (Mb) | config | read r/sec | write r/sec 0 | to_sql | 12.345 | 0 | 1000000 | | | nil | 81004 1 | to_json | 10.814 | 4.406 | 1000000 | 142 | | 226963 | 92472 2 | to_xlsx | 10.569 | 21.572 | 100000 | 9 | {'sheet': 'pyexcel_sheet1'} | 4635 | 9461 3 | to_ods | 29.175 | 29.487 | 50000 | 3 | {'sheet': 'pyexcel_sheet1'} | 1695 | 1713 4 | to_csv | 14.315 | 15.731 | 1000000 | 108 | {} | 63568 | 69856 5 | to_csv | 14.438 | 8.169 | 1000000 | 108 | {'guess_datatypes': False} | 122414 | 69261 6 | to_csv | 140.645 | 99.45 | 10000000 | 1080 | {'guess_datatypes': False} | 100553 | 71100 7 | to_tsv | 13.834 | 15.763 | 1000000 | 108 | {} | 63439 | 72285 8 | to_text | 13.937 | 15.682 | 1000000 | 108 | {} | 63767 | 71751 9 | to_html | 12.578 | 0.53 | 1000000 | 228 | {} | 1886792 | 79503 10 | to_hdf5 | 5.01 | 12.345 | 1000000 | 316 | {} | 81004 | 199600 Conclusions Best: - to/from JSON wins with ~230k rps read
- to/from CSV/TSV/TEXT comes 2nd with config
guess_datatypes=False with ~ 100k rps Worst: - to/from ods burst the memory footprint and hence had to be reduced to 50k rows. It also had the slowest read rate with ~1,700 rps.
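The JSON pair that tops the author's list needs no file at all: to_json returns a string and from_json rebuilds the table, exactly as in the to_json_benchmark above. A minimal round-trip:

    from tablite import Table

    t = Table({'A': [1, 2, 3], 'B': ['x', 'y', 'z']})
    json_str = t.to_json()
    t2 = Table.from_json(json_str)
    assert t == t2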
In\u00a0[17]: def contains_benchmark(table):\n results = Table()\n results.add_columns( \"column\", \"time (s)\" )\n for name,col in table.columns.items():\n n = len(col)\n start,stop,step = int(n*0.02), int(n*0.98), int(n/100)\n selection = col[start:stop:step]\n total_time = 0.0\n for v in selection:\n start_time = perf_counter()\n v in col # <--- test!\n end_time = perf_counter()\n total_time += (end_time - start_time)\n avg_time = total_time / len(selection)\n results.add_rows( name, round(avg_time,3) )\n\n return results\n In\u00a0[18]: has_it = contains_benchmark(tables[-1])\nhas_it\n Out[18]: # | column | time (s) 0 | # | 0.001 1 | 1 | 0.043 2 | 2 | 0.032 3 | 3 | 0.001 4 | 4 | 0.001 5 | 5 | 0.001 6 | 6 | 0.006 7 | 7 | 0.003 8 | 8 | 0.006 9 | 9 | 0.007 10 | 10 | 0.043 11 | 11 | 0.655 In\u00a0[19]: def slicing_benchmark(table):\n n = len(table)\n start,stop,step = int(0.02*n), int(0.98*n), int(n / 20) # from 2% to 98% in 20 large steps\n start_time = perf_counter()\n snip = table[start:stop:step]\n end_time = perf_counter()\n print(f\"reading {len(table):,} rows to find {len(snip):,} rows took {end_time-start_time:.3f} sec\")\n return snip\n In\u00a0[20]: slice_it = slicing_benchmark(tables[-1])\n reading 50,000,000 rows to find 20 rows took 1.435 sec\n In\u00a0[22]: def column_selection_benchmark(tables):\n results = Table()\n results.add_columns( 'rows')\n results.add_columns(*[f\"n cols={i}\" for i,_ in enumerate(tables[0].columns,start=1)])\n\n for table in tables:\n rr = [len(table)]\n for ix, name in enumerate(table.columns):\n cols = list(table.columns)[:ix+1]\n start_time = perf_counter()\n table[cols]\n end_time = perf_counter()\n rr.append(f\"{end_time-start_time:.5f}\")\n results.add_rows( rr )\n return results\n
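The slicing benchmark above leans on the same getitem syntax as the feature overview. A sketch of the three common forms, with illustrative data:

    from tablite import Table

    t = Table({'A': list(range(100)), 'B': list(range(100))})
    snip = t[2:98:5]                        # row slice across all columns
    col = t['A']                            # a single column
    sub = t['A', 'B', slice(0, None, 15)]   # named columns plus a row slice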
In\u00a0[23]: csb = column_selection_benchmark(tables)\nprint(\"times below are in seconds\")\ncsb\n times below are in seconds\n Out[23]: # | rows | n cols=1..12 0 | 1000000 | 0.00001 0.00006 0.00004 0.00004 0.00004 0.00004 0.00004 0.00004 0.00004 0.00004 0.00004 0.00004 1 | 2000000 | 0.00001 0.00008 0.00003 0.00003 0.00003 0.00003 0.00003 0.00003 0.00003 0.00003 0.00004 0.00004 2 | 5000000 | 0.00001 0.00005 0.00004 0.00004 0.00004 0.00004 0.00004 0.00004 0.00004 0.00004 0.00004 0.00004 3 | 10000000 | 0.00002 0.00005 0.00004 0.00004 0.00004 0.00004 0.00007 0.00005 0.00005 0.00005 0.00005 0.00005 4 | 20000000 | 0.00003 0.00006 0.00005 0.00005 0.00005 0.00005 0.00006 0.00006 0.00006 0.00006 0.00006 0.00006 5 | 50000000 | 0.00009 0.00011 0.00010 0.00009 0.00009 0.00009 0.00009 0.00009 0.00009 0.00009 0.00010 0.00009 In\u00a0[33]: def iterrows_benchmark(table):\n results = Table()\n results.add_columns( 'n columns', 'time (s)')\n\n columns = ['1']\n for column in list(table.columns):\n columns.append(column)\n snip = table[columns, slice(500_000,1_500_000)]\n start_time = perf_counter()\n counts = 0\n for row in snip.rows:\n counts += 1\n end_time = perf_counter()\n results.add_rows( len(columns), round(end_time-start_time,3))\n\n return results\n In\u00a0[34]: iterb = iterrows_benchmark(tables[-1])\niterb\n Out[34]: # | n columns | time (s) 0 | 2 | 9.951 1 | 3 | 9.816 2 | 4 | 9.859 3 | 5 | 9.93 4 | 6 | 9.985 5 | 7 | 9.942 6 | 8 | 9.958 7 | 9 | 9.867 8 | 10 | 9.96 9 | 11 | 9.932 10 | 12 | 9.83 11 | 13 | 9.861 In\u00a0[35]: import matplotlib.pyplot as plt\nplt.plot(iterb['n columns'], iterb['time (s)'])\nplt.show()\n In\u00a0[28]: tables[-1].types()\n Out[28]: {'#': {int: 50000000},\n '1': {int: 50000000},\n '2': {str: 50000000},\n '3': {int: 50000000},\n '4': {int: 50000000},\n '5': {int: 50000000},\n '6': {str: 50000000},\n '7': {str: 50000000},\n '8': {str: 50000000},\n '9': {str: 50000000},\n '10': {float: 50000000},\n '11': {str: 50000000}} In\u00a0[29]: def dtypes_benchmark(tables):\n dtypes_results = Table()\n dtypes_results.add_columns(\"rows\", \"time (s)\")\n\n for table in tables:\n start_time = perf_counter()\n dt = table.types()\n end_time = perf_counter()\n assert isinstance(dt, dict) and len(dt) != 0\n dtypes_results.add_rows( len(table), round(end_time-start_time, 3) )\n\n return dtypes_results\n In\u00a0[30]: dtype_b = dtypes_benchmark(tables)\ndtype_b\n Out[30]: # | rows | time (s) 0 | 1000000 | 0.0 1 | 2000000 | 0.0 2 | 5000000 | 0.0 3 | 10000000 | 0.0 4 | 20000000 | 0.0 5 | 50000000 | 0.001
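Table.types(), timed above, returns a nested dict with one entry per column mapping each python type to its row count, which is what makes the near-constant-time schema check possible. A sketch, with illustrative counts:

    from tablite import Table

    t = Table({'A': [1, 2, None], 'B': ['x', 'y', 'z']})
    for name, type_counts in t.types().items():
        print(name, type_counts)   # e.g. A {<class 'int'>: 2, <class 'NoneType'>: 1}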
In\u00a0[31]: def any_benchmark(tables):\n results = Table()\n results.add_columns(\"rows\", *list(tables[0].columns))\n\n for table in tables:\n tmp = [len(table)]\n for column in list(table.columns):\n v = table[column][0]\n start_time = perf_counter()\n _ = table.any(**{column: v})\n end_time = perf_counter()\n tmp.append(round(end_time-start_time,3))\n\n results.add_rows( tmp )\n return results\n In\u00a0[32]: anyb = any_benchmark(tables)\nanyb\n Out[32]: [flattened 6x13 timing matrix elided: any() runtimes in seconds per column (#, 1..11) for tables of 1M..50M rows; roughly 0.13-0.29 s at 1M rows, growing to roughly 6.3-14.5 s at 50M rows] In\u00a0[36]: def all_benchmark(tables):\n results = Table()\n results.add_columns(\"rows\", *list(tables[0].columns))\n\n for table in tables:\n tmp = [len(table)]\n for column in list(table.columns):\n v = table[column][0]\n start_time = perf_counter()\n _ = table.all(**{column: v})\n end_time = perf_counter()\n tmp.append(round(end_time-start_time,3))\n\n results.add_rows( tmp )\n return results\n In\u00a0[37]: allb = all_benchmark(tables)\nallb\n Out[37]: [flattened 6x13 timing matrix elided: all() runtimes closely tracking any() above; roughly 0.12-0.26 s at 1M rows and 5.8-12.9 s at 50M rows] In\u00a0[38]: def unique_benchmark(tables):\n results = Table()\n results.add_columns(\"rows\", *list(tables[0].columns))\n\n for table in tables:\n length = len(table)\n\n tmp = [len(table)]\n for column in list(table.columns):\n start_time = perf_counter()\n try:\n L = table[column].unique()\n dt = perf_counter() - start_time\n except MemoryError:\n dt = -1\n tmp.append(round(dt,3))\n assert 0 < len(L) <= length\n\n results.add_rows( tmp )\n return results\n
In\u00a0[39]: ubm = unique_benchmark(tables)\nubm\n Out[39]: [flattened 6x13 timing matrix elided: unique() runtimes in seconds per column; 0.016-0.447 s at 1M rows, 0.764-30.511 s at 50M rows, slowest on the high-cardinality string column 11] In\u00a0[40]: def index_benchmark(tables):\n results = Table()\n results.add_columns(\"rows\", *list(tables[0].columns))\n\n for table in tables:\n\n tmp = [len(table)]\n for column in list(table.columns):\n start_time = perf_counter()\n try:\n _ = table.index(column)\n dt = perf_counter() - start_time\n except MemoryError:\n dt = -1\n tmp.append(round(dt,3))\n\n results.add_rows( tmp )\n return results\n In\u00a0[41]: ibm = index_benchmark(tables)\nibm\n Out[41]: [flattened 6x13 timing matrix elided: single-column index build times in seconds; 1.05-2.33 s at 1M rows rising to 54-110 s at 50M rows] Multi-column index next: In\u00a0[42]: def multi_column_index_benchmark(tables):\n\n selection = [\"4\", \"7\", \"8\", \"9\"]\n results = Table()\n results.add_columns(\"rows\", *range(1,len(selection)+1))\n\n for table in tables:\n\n tmp = [len(table)]\n for index in range(1,5):\n start_time = perf_counter()\n try:\n _ = table.index(*selection[:index])\n dt = perf_counter() - start_time\n except MemoryError:\n dt = -1\n tmp.append(round(dt,3))\n print('.', end='')\n\n results.add_rows( tmp )\n return results\n In\u00a0[43]: mcib = multi_column_index_benchmark(tables)\nmcib\n ........................ Out[43]: # | rows | 1 | 2 | 3 | 4 0 | 1000000 | 1.058 | 2.133 | 3.215 | 4.052 1 | 2000000 | 2.12 | 4.278 | 6.546 | 8.328 2 | 5000000 | 5.303 | 10.89 | 16.693 | 20.793 3 | 10000000 | 10.581 | 22.407 | 33.462 | 41.91 4 | 20000000 | 21.064 | 45.954 | 67.781 | 84.828 5 | 50000000 | 52.347 | 109.551 | 166.6 | 211.053
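The indexes being timed above are plain dicts mapping key-tuples to row positions, as the tutorial later in this page shows. A sketch of a multi-column index lookup, with illustrative data:

    from tablite import Table

    t = Table({'A': [1, 1, 2], 'B': [1, 1, 2], 'C': [-1, -2, -3]})
    idx = t.index('A', 'B')      # {(1, 1): [0, 1], (2, 2): [2]}
    rows_for_key = idx[(2, 2)]   # row positions for that key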
In\u00a0[44]: def drop_duplicates_benchmark(tables):\n results = Table()\n results.add_columns(\"rows\", *list(tables[0].columns))\n\n for table in tables:\n result = [len(table)]\n cols = []\n for name in list(table.columns):\n cols.append(name)\n start_time = perf_counter()\n try:\n _ = table.drop_duplicates(*cols)\n dt = perf_counter() - start_time\n except MemoryError:\n dt = -1\n result.append(round(dt,3))\n print('.', end='')\n\n results.add_rows( result )\n return results\n In\u00a0[45]: ddb = drop_duplicates_benchmark(tables)\nddb\n ........................................................................ Out[45]: [flattened 6x13 timing matrix elided: drop_duplicates() runtimes in seconds as more columns are added to the key; 1.8-9.7 s at 1M rows rising to 64-684 s at 50M rows]"},{"location":"benchmarks/#benchmarks","title":"Benchmarks\u00b6","text":"These benchmarks seek to establish the performance of tablite as a user sees it. Overview Input/Output Various column functions Base functions Core functions - Save / Load .tpz format- Save tables to various formats- Import data from various formats - Setitem / getitem- iter- equal, not equal- copy- t += t- t *= t- contains- remove all- replace- index- unique- histogram- statistics- count - Setitem / getitem- iter / rows- equal, not equal- load- save- copy- stack- types- display_dict- show- to_dict- as_json_serializable- index - expression- filter- sort_index- reindex- drop_duplicates- sort- is_sorted- any- all- drop - replace- groupby- pivot- joins- lookup- replace missing values- transpose- pivot_transpose- diff"},{"location":"benchmarks/#input-output","title":"Input / Output\u00b6","text":""},{"location":"benchmarks/#create-tables-from-synthetic-data","title":"Create tables from synthetic data.\u00b6","text":""},{"location":"benchmarks/#save-load-tpz-format","title":"Save / Load .tpz format\u00b6","text":"With default compression settings (10% slower than uncompressed, 20% of uncompressed filesize) "},{"location":"benchmarks/#save-load-tables-to-from-various-formats","title":"Save / load tables to / from various formats\u00b6","text":"The handlers for saving / export are: - to_sql
- to_json
- to_xls
- to_ods
- to_csv
- to_tsv
- to_text
- to_html
- to_hdf5
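Each handler in the list above was driven through the same path keyword in the benchmark harness. A hedged sketch, assuming that signature holds for direct calls; the file names are illustrative:

    from pathlib import Path
    from tablite import Table

    t = Table({'A': [1, 2, 3]})
    t.to_csv(path=Path('out.csv'))
    t.to_xlsx(path=Path('out.xlsx'))
    t.to_hdf5(path=Path('out.hdf5'))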
"},{"location":"benchmarks/#various-column-functions","title":"Various column functions\u00b6","text":" - Setitem / getitem
- iter
- equal, not equal
- copy
- t += t
- t *= t
- contains
- remove all
- replace
- index
- unique
- histogram
- statistics
- count
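Most of the column functions listed above hang off the Column object returned by indexing a table. A hedged sketch; the histogram and statistics signatures are assumptions based only on the names listed here and in the changelog:

    from tablite import Table

    t = Table({'A': [1, 1, 2, 3]})
    col = t['A']
    values = col.unique()      # timed in the unique benchmark below
    n_ones = col.count(1)      # per the 2022.11.0 changelog entry
    stats = col.statistics()   # summary statistics (see the 2022.10.7 bugfix note)
    hist = col.histogram()     # assumption: listed above, signature not shown on this page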
"},{"location":"benchmarks/#various-table-functions","title":"Various table functions\u00b6","text":""},{"location":"benchmarks/#slicing","title":"Slicing\u00b6","text":"Slicing operations are used in many places. "},{"location":"benchmarks/#tabletypes","title":"Table.types()\u00b6","text":"Table.types() is implemented for near constant speed lookup. Here is an example: "},{"location":"benchmarks/#tableany","title":"Table.any\u00b6","text":""},{"location":"benchmarks/#tableall","title":"Table.all\u00b6","text":""},{"location":"benchmarks/#tablefilter","title":"Table.filter\u00b6","text":""},{"location":"benchmarks/#tableunique","title":"Table.unique\u00b6","text":""},{"location":"benchmarks/#tableindex","title":"Table.index\u00b6","text":"Single column index first: "},{"location":"benchmarks/#drop-duplicates","title":"drop duplicates\u00b6","text":""},{"location":"changelog/","title":"Changelog","text":"Version Change 2023.9.0 Adding Table.match operation. 2023.8.0 Nim backend for csv importer.Improve excel importer.Improve slicing consistency.Logical cores re-enabled on *nix based systems.Filter is now type safe.Added merge utility.Various bugfixes. 2023.6.5 Fix issues with get_headers falling back to text reading when reading 0 lines of excel, fix issue where reading excel file would ignore file count, excel file reader now has parity for linecount selection. 2023.6.4 Fix a logic bug in get_headers that caused one extra line to be returned than requested. 2023.6.3 Updated the way reference counting works. Tablite now tracks references to used pages and cleans them up based on number of references to those pages in the current process. This change allows to handle deep table clones when sending tables via processes (pickling/unpickling), whereas previous implementation would corrupt all tables using same pages due to reference counting asserting that all tables are shallow copies to the same object. 2023.6.2 Updated mplite dependency, changed to soft version requirement to prevent pipeline freezes due to small bugfixes in mplite . 2023.6.1 Major change of the backend processes. Speed up of ~6x. For more see the release notes 2022.11.19 Fixed some memory leaks. 2022.11.18 copy , filter , sort , any , all methods now properly respects the table subclass.Filter for tables with under SINGLE_PROCESSING_LIMIT rows will run on same process to reduce overhead.Errors within child processes now properly propagate to parent.Table.reset_storage(include_imports=True) now allows the user to reset the storage but exclude any imported files by setting include_imports=False during Table.reset(...) .Bug: A column with 1,None,2 would be written to csv & tsv as \"1,None,2\" . Now it is written \"1,,2\" where None means absent.Fix mp join producing mismatched columns lengths when different table lengths are used as an input or when join product is longer than the input table. 2022.11.17 Table.load now properly subclassess the table instead of always resulting in tablite.Table .Table.from_* methods now respect subclassess, fixed some from_* methods which were instance methods and not class methods.Fixed Table.from_dict only accepting list and tuple but not tablite.Column which is an equally valid type.Fix lookup parity in single process and multiple process outputs.Fix an issue with multiprocess lookup where no matches would throw instead of producing None .Fix an issue with filtering an empty table. 2022.11.16 Changed join to process 1M rows per task to avoid potential OOM on lower memory systems. 
Added mp_merge_columns to MemoryManager that merges column pages into a single column.Fix join parity in single process and multiple process outputs.Fix an issue with multiprocess join where no matches would throw instead of producing None . 2022.11.15 Bump mplite to avoid deadlock issues when the OS kills the process. 2022.11.14 Improve locking mechanism to allow retries when opening a file, as the previous solution could cause deadlocks when running multiple threads. 2022.11.13 Fix an issue with copying empty pages. 2022.11.12 Tablite is now able to create its own temporary directory. 2022.11.11 text_reader tqdm tracks the entire process now. text_reader properly respects free memory on *nix based systems. text_reader no longer discriminates against hyperthreaded cores. 2022.11.10 get_headers now uses plain openpyxl instead of the pyexcel wrapper to speed up fetch times ~10x on certain files. 2022.11.9 get_headers can fail safe on unrecognized characters. 2022.11.8 Fix a bug with task size calculation on single core systems. 2022.11.7 Added TABLITE_TMPDIR environment variable for setting the tablite work directory. Characters that fail to be read by the text reader due to improper encoding will be skipped. Fixed an issue where single column text files with no column delimiters would be imported as empty tables. 2022.11.6 Date inference fix 2022.11.5 Fixed negative slicing issues 2022.11.4 Transpose API changes: table.transpose(...) was renamed to table.pivot_transpose(...) new table.transpose() and table.T were added, its functionality acts similarly to numpy.T , the column headers are used as the first row in the table when transposing. 2022.11.3 Bugfix for non-ascii encoded strings during t.add_rows(...) 2022.11.2 As utf-8 is ascii compatible, the file reader utils selects utf-8 instead of ascii as a default. 2022.11.1 bugfix in datatypes.infer() where 1 was inferred as int, not float. 2022.11.0 New table features: Table.diff(other, columns=...) , table.remove_duplicates_rows() , table.drop_na(*arg) ,table.replace(target,replacement) , table.imputation(sources, targets, methods=...) , table.to_pandas() and Table.from_pandas(pd.DataFrame) ,table.to_dict(columns, slice) , Table.from_dict() ,table.transpose(columns, keep, ...) , New column features: Column.count(item) , Column[:] is guaranteed to return a python list.Column.to_numpy(slice) returns np.ndarray . new tools library: from tablite import tools with: date_range(start,end) , xround(value, multiple, up=None) , and guess as a short-cut for Datatypes.guess(...) . bugfixes: __eq__ was updated but missed __ne__ .in operator in filter would crash if datatypes were not strings. 2022.10.11 filter now accepts any expression (str) that can be compiled by python's compiler 2022.10.11 Bugfix for .any and .all . The code now executes much faster 2022.10.10 Bugfix for Table.import_file : import_as has been removed from keywords. 2022.10.10 All Table functions now have tqdm progressbar. 2022.10.10 More robust calculation of task size for multiprocessing. 2022.10.10 Dependency update: mplite==1.2.0 is now required. 2022.10.9 Bugfix for Table.import_file : files with duplicate header names would only have the last duplicate name imported.Now the headers are made unique using name_x where x is a number. 2022.10.8 Bugfix for groupby: Where keys are empty an error is now raised.Where there are no functions, unique keypairs are returned. 
2022.10.7 Bugfix for Column.statistics() for an empty column 2022.10.6 Bugfix for __setitem__ : tbl['a'] = [] is now seen as tbl.add_column('a') Bugfix for __getitem__ : calling a missing key raises keyerror. 2022.10.5 Bugfix for summary statistics. 2022.10.4 Bugfix for join shortcut. 2022.10.3 Bugfix for DataTypes where bool was evaluated wrongly. 2022.10.0 Added ability to reindex in table.reindex(index=[0,1...,n,n-1]) 2022.9.0 Added ability to store python objects (example).Added warning when user iterates over non-rectangular dataset. 2022.8.0 Added table.export(path) which exports tablite Tables to the file format given by the file extension. For example my_table.export('example.xlsx') .supported formats are: json , html , xlsx , xls , csv , tsv , txt , ods and sql . 2022.7.8 Added ability to forward tqdm progressbar into Table.import_file(..., tqdm=your_tqdm) , so that Jupyter notebook can use it in display -methods. 2022.7.7 Added method Table.to_sql() for export to ANSI-92 SQL engines.Bugfix on to_json for timedelta . Jupyter notebook provides nice view using Table._repr_html_() . JS-users can use .as_json_serializable where suitable. 2022.7.6 get_headers now takes argument (path, linecount=10) 2022.7.5 added helper Table.as_json_serializable as Jupyterkernel compat. 2022.7.4 added helper Table.to_dict , and updated Table.to_json 2022.7.3 table.to_json now takes kwargs: row_count , columns , slice_ , start_on 2022.7.2 documentation update. 2022.7.1 minor bugfix. 2022.7.0 BREAKING CHANGES- Tablite now uses HDF5 as backend. - Has multiprocessing enabled by default. - Is 20x faster. - Completely new API. 2022.6.0 DataTypes.guess([list of strings]) returns the best matching python datatype."},{"location":"tutorial/","title":"Tutorial","text":"In\u00a0[1]: from tablite import Table\n\n## Creating a tablite table is as simple as populating a dictionary:\nt = Table({'A':[1,2,3], 'B':['a','b','c']})\n In\u00a0[2]: ## In this notebook we can show tables in the HTML style:\nt\n Out[2]: # | A | B 0 | 1 | a 1 | 2 | b 2 | 3 | c In\u00a0[3]: ## or the ascii style:\nt.show()\n +==+=+=+\n|# |A|B|\n+--+-+-+\n| 0|1|a|\n| 1|2|b|\n| 2|3|c|\n+==+=+=+\n In\u00a0[4]: ## or if you'd like to inspect the table, use:\nprint(str(t))\n Table(2 columns, 3 rows)\n In\u00a0[5]: ## You can also add all columns at once (slower) if you prefer. \nt2 = Table(headers=('A','B'), rows=((1,'a'),(2,'b'),(3,'c')))\nassert t==t2\n In\u00a0[6]: ## or load data:\nt3 = Table.from_file('tests/data/book1.csv')\n\n## to view any table in the notebook just let jupyter show the table. If you're using the terminal use .show(). \n## Note that show gives either first and last 7 rows or the whole table if it is less than 20 rows.\nt3\n
Collecting tasks: 'tests/data/book1.csv'\nDumping tasks: 'tests/data/book1.csv'\n importing file: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 1/1 [00:00<00:00, 487.82it/s]\n Out[6]: [rendered preview of t3 elided: 45 rows x 6 float columns (a..f)] In\u00a0[7]: ## should you however want to select the headers instead of importing everything\n## (which may be time consuming), simply use get_headers(path)\nfrom tablite.tools import get_headers\nfrom pathlib import Path\npath = Path('tests/data/book1.csv')\nsample = get_headers(path, linecount=5)\nprint(f\"sample is of type {type(sample)} and has the following entries:\")\nfor k,v in sample.items():\n print(k)\n if isinstance(v,list):\n for r in sample[k]:\n print(\"\\t\", r)\n sample is of type <class 'dict'> and has the following entries:\ndelimiter\nbook1.csv\n\t ['a', 'b', 'c', 'd', 'e', 'f']\n\t ['1', '0.060606061', '0.090909091', '0.121212121', '0.151515152', '0.181818182']\n\t ['2', '0.121212121', '0.242424242', '0.484848485', '0.96969697', '1.939393939']\n\t ['3', '0.242424242', '0.484848485', '0.96969697', '1.939393939', '3.878787879']\n\t ['4', '0.484848485', '0.96969697', '1.939393939', '3.878787879', '7.757575758']\n\t ['5', '0.96969697', '1.939393939', '3.878787879', '7.757575758', '15.51515152']\n In\u00a0[8]: ## to extend a table by adding columns, use t[new] = [new values]\nt['C'] = [4,5,6]\n## but make sure the column has the same length as the rest of the table!\nt\n Out[8]: # | A | B | C 0 | 1 | a | 4 1 | 2 | b | 5 2 | 3 | c | 6 In\u00a0[9]: 
## should you want to mix datatypes, tablite will not complain:\nfrom datetime import datetime, date,time,timedelta\nimport numpy as np\n## What you put in ...\nt4 = Table()\nt4['mixed'] = [\n -1,0,1, # regular integers\n -12345678909876543211234567890987654321, # very very large integer\n None,np.nan, # null values \n \"one\", \"\", # strings\n True,False, # booleans\n float('inf'), 0.01, # floats\n date(2000,1,1), # date\n datetime(2002,2,3,23,0,4,6660), # datetime\n time(12,12,12), # time\n timedelta(days=3, seconds=5678) # timedelta\n]\n## ... is exactly what you get out:\nt4\n Out[9]: # | mixed 0 | -1 1 | 0 2 | 1 3 | -12345678909876543211234567890987654321 4 | None 5 | nan 6 | one 7 | (empty str) 8 | True 9 | False 10 | inf 11 | 0.01 12 | 2000-01-01 13 | 2002-02-03 23:00:04.006660 14 | 12:12:12 15 | 3 days, 1:34:38 In\u00a0[10]: ## also if you claim the values back as a python list:\nfor item in list(t4['mixed']):\n print(item)\n -1\n0\n1\n-12345678909876543211234567890987654321\nNone\nnan\none\n\nTrue\nFalse\ninf\n0.01\n2000-01-01\n2002-02-03 23:00:04.006660\n12:12:12\n3 days, 1:34:38\n The column itself (__repr__ ) shows us the pid , file location and the entries, so you know exactly what you're working with. In\u00a0[11]: t4['mixed']\n Out[11]: Column(/tmp/tablite-tmp/pid-54911, [-1 0 1 -12345678909876543211234567890987654321 None nan 'one' '' True\n False inf 0.01 datetime.date(2000, 1, 1)\n datetime.datetime(2002, 2, 3, 23, 0, 4, 6660) datetime.time(12, 12, 12)\n datetime.timedelta(days=3, seconds=5678)]) In\u00a0[12]: ## to view the datatypes in a column, use Column.types()\ntype_dict = t4['mixed'].types()\nfor k,v in type_dict.items():\n print(k,v)\n <class 'int'> 4\n<class 'NoneType'> 1\n<class 'float'> 3\n<class 'str'> 2\n<class 'bool'> 2\n<class 'datetime.date'> 1\n<class 'datetime.datetime'> 1\n<class 'datetime.time'> 1\n<class 'datetime.timedelta'> 1\n In\u00a0[13]: ## You may have noticed that all datatypes in t3 were identified as floats, despite their origin from a text type file.\n## This is because tablite guesses the most probable datatype using the `.guess` function on each column.
## You can use the .guess function like this: from tablite import DataTypes t3['a'] = DataTypes.guess(t3['a']) ## You can also convert the datatype using a list comprehension t3['b'] = [float(v) for v in t3['b']] t3 Out[13]: #abcdef 010.0606060610.0909090910.1212121210.1515151520.181818182 120.1212121210.2424242420.4848484850.969696971.939393939 230.2424242420.4848484850.969696971.9393939393.878787879 340.4848484850.969696971.9393939393.8787878797.757575758 450.969696971.9393939393.8787878797.75757575815.51515152 561.9393939393.8787878797.75757575815.5151515231.03030303 673.8787878797.75757575815.5151515231.0303030362.06060606.....................383916659267088.033318534175.066637068350.0133274000000.0266548000000.0394033318534175.066637068350.0133274000000.0266548000000.0533097000000.0404166637068350.0133274000000.0266548000000.0533097000000.01066190000000.04142133274000000.0266548000000.0533097000000.01066190000000.02132390000000.04243266548000000.0533097000000.01066190000000.02132390000000.04264770000000.04344533097000000.01066190000000.02132390000000.04264770000000.08529540000000.044451066190000000.02132390000000.04264770000000.08529540000000.017059100000000.0 In\u00a0[14]: Copied! t = Table()\nfor column_name in 'abcde':\n t[column_name] =[i for i in range(5)]\n t = Table() for column_name in 'abcde': t[column_name] =[i for i in range(5)] (2) we want to add two new columns using the functions: In\u00a0[15]: Copied! def f1(a,b,c):\n return a+b+c+1\ndef f2(b,c,d):\n return b*c*d\n def f1(a,b,c): return a+b+c+1 def f2(b,c,d): return b*c*d (3) and we want to compute two new columns f and g : In\u00a0[16]: Copied! t.add_columns('f', 'g')\n t.add_columns('f', 'g') (4) we can now use the filter, to iterate over the table, and add the values to the two new columns: In\u00a0[17]: Copied! f,g=[],[]\nfor row in t['a', 'b', 'c', 'd'].rows:\n a, b, c, d = row\n\n f.append(f1(a, b, c))\n g.append(f2(b, c, d))\nt['f'] = f\nt['g'] = g\n\nassert len(t) == 5\nassert list(t.columns) == list('abcdefg')\nt\n f,g=[],[] for row in t['a', 'b', 'c', 'd'].rows: a, b, c, d = row f.append(f1(a, b, c)) g.append(f2(b, c, d)) t['f'] = f t['g'] = g assert len(t) == 5 assert list(t.columns) == list('abcdefg') t Out[17]: #abcdefg 00000010 11111141 22222278 3333331027 4444441364 Take note that if your dataset is assymmetric, a warning will be show: In\u00a0[18]: Copied! assymmetric_table = Table({'a':[1,2,3], 'b':[1,2]})\nfor row in assymmetric_table.rows:\n print(row)\n## warning at the bottom ---v\n assymmetric_table = Table({'a':[1,2,3], 'b':[1,2]}) for row in assymmetric_table.rows: print(row) ## warning at the bottom ---v [1, 1]\n[2, 2]\n[3, None]\n /home/bjorn/github/tablite/tablite/base.py:1188: UserWarning: Column b has length 2 / 3. None will appear as fill value.\n warnings.warn(f\"Column {name} has length {len(column)} / {n_max}. None will appear as fill value.\")\n In\u00a0[19]: Copied! table7 = Table(columns={\n'A': [1,1,2,2,3,4],\n'B': [1,1,2,2,30,40],\n'C': [-1,-2,-3,-4,-5,-6]\n})\nindex = table7.index('A', 'B')\nfor k, v in index.items():\n print(\"key\", k, \"indices\", v)\n table7 = Table(columns={ 'A': [1,1,2,2,3,4], 'B': [1,1,2,2,30,40], 'C': [-1,-2,-3,-4,-5,-6] }) index = table7.index('A', 'B') for k, v in index.items(): print(\"key\", k, \"indices\", v) key (1, 1) indices [0, 1]\nkey (2, 2) indices [2, 3]\nkey (3, 30) indices [4]\nkey (4, 40) indices [5]\n The keys are created for each unique column-key-pair, and the value is the index where the key is found. 
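Because the index was consumed with .items() above, ordinary dict idioms apply to it; a small sketch, assuming the index behaves like a plain dict keyed by tuples:

## a sketch: dict-style access on the multi-key index
assert (2, 2) in index  ## membership test on the key pair
missing = index.get((9, 9), [])  ## safe lookup for a key that never occurs
assert missing == []
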
To fetch all rows for key (2,2) , we can use: In\u00a0[20]: Copied! for ix, row in enumerate(table7.rows):\n if ix in index[(2,2)]:\n print(row)\n for ix, row in enumerate(table7.rows): if ix in index[(2,2)]: print(row) [2, 2, -3]\n[2, 2, -4]\n In\u00a0[21]: Copied! ## to append one table to another, use + or += \nprint('length before:', len(t3)) # length before: 45\nt5 = t3 + t3 \nprint('length after +', len(t5)) # length after + 90\nt5 += t3 \nprint('length after +=', len(t5)) # length after += 135\n## if you need a lot of numbers for a test, you can repeat a table using * and *=\nt5 *= 1_000\nprint('length after +=', len(t5)) # length after += 135000\n ## to append one table to another, use + or += print('length before:', len(t3)) # length before: 45 t5 = t3 + t3 print('length after +', len(t5)) # length after + 90 t5 += t3 print('length after +=', len(t5)) # length after += 135 ## if you need a lot of numbers for a test, you can repeat a table using * and *= t5 *= 1_000 print('length after +=', len(t5)) # length after += 135000 length before: 45\nlength after + 90\nlength after += 135\nlength after += 135000\n In\u00a0[22]: Copied! t5\n t5 Out[22]: #abcdef 010.0606060610.0909090910.1212121210.1515151520.181818182 120.1212121210.2424242420.4848484850.969696971.939393939 230.2424242420.4848484850.969696971.9393939393.878787879 340.4848484850.969696971.9393939393.8787878797.757575758 450.969696971.9393939393.8787878797.75757575815.51515152 561.9393939393.8787878797.75757575815.5151515231.03030303 673.8787878797.75757575815.5151515231.0303030362.06060606..................... 134,9933916659267088.033318534175.066637068350.0133274000000.0266548000000.0 134,9944033318534175.066637068350.0133274000000.0266548000000.0533097000000.0 134,9954166637068350.0133274000000.0266548000000.0533097000000.01066190000000.0 134,99642133274000000.0266548000000.0533097000000.01066190000000.02132390000000.0 134,99743266548000000.0533097000000.01066190000000.02132390000000.04264770000000.0 134,99844533097000000.01066190000000.02132390000000.04264770000000.08529540000000.0 134,999451066190000000.02132390000000.04264770000000.08529540000000.017059100000000.0 In\u00a0[23]: Copied! ## if your are in doubt whether your tables will be the same you can use .stack(other)\nassert t.columns != t2.columns # compares list of column names.\nt6 = t.stack(t2)\nt6\n ## if your are in doubt whether your tables will be the same you can use .stack(other) assert t.columns != t2.columns # compares list of column names. t6 = t.stack(t2) t6 Out[23]: #abcdefgAB 00000010NoneNone 11111141NoneNone 22222278NoneNone 3333331027NoneNone 4444441364NoneNone 5NoneNoneNoneNoneNoneNoneNone1a 6NoneNoneNoneNoneNoneNoneNone2b 7NoneNoneNoneNoneNoneNoneNone3c In\u00a0[24]: Copied! ## As you can see above, t6['C'] is padded with \"None\" where t2 was missing the columns.\n\n## if you need a more detailed view of the columns you can iterate:\nfor name in t.columns:\n col_from_t = t[name]\n if name in t2.columns:\n col_from_t2 = t2[name]\n print(name, col_from_t == col_from_t2)\n else:\n print(name, \"not in t2\")\n ## As you can see above, t6['C'] is padded with \"None\" where t2 was missing the columns. ## if you need a more detailed view of the columns you can iterate: for name in t.columns: col_from_t = t[name] if name in t2.columns: col_from_t2 = t2[name] print(name, col_from_t == col_from_t2) else: print(name, \"not in t2\") a not in t2\nb not in t2\nc not in t2\nd not in t2\ne not in t2\nf not in t2\ng not in t2\n In\u00a0[25]: Copied! 
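The non in-place * works like *= above but returns a new table; a hedged sketch, assuming the original table is left untouched:

## a sketch: repeat into a new table instead of in-place
t_repeated = t3 * 3
assert len(t_repeated) == 3 * len(t3)
assert len(t3) == 45  ## t3 itself is unchanged
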
## to make a copy of a table, use table.copy()\nt3_copy = t3.copy()\n\n## you can also perform multi criteria selections using getitem [ ... ]\nt3_slice = t3['a','b','d', 5:25:5]\nt3_slice\n ## to make a copy of a table, use table.copy() t3_copy = t3.copy() ## you can also perform multi criteria selections using getitem [ ... ] t3_slice = t3['a','b','d', 5:25:5] t3_slice Out[25]: #abd 061.9393939397.757575758 11162.06060606248.2424242 2161985.9393947943.757576 32163550.06061254200.2424 In\u00a0[26]: Copied! ##deleting items also works the same way:\ndel t3_slice[1:3] # delete row number 2 & 3 \nt3_slice\n ##deleting items also works the same way: del t3_slice[1:3] # delete row number 2 & 3 t3_slice Out[26]: #abd 061.9393939397.757575758 12163550.06061254200.2424 In\u00a0[27]: Copied! ## to wipe a table, use .clear:\nt3_slice.clear()\nt3_slice\n ## to wipe a table, use .clear: t3_slice.clear() t3_slice Out[27]: Empty Table In\u00a0[28]: Copied! ## tablite uses .npy for storage because it is fast.\n## this means you can make a table persistent using .save\nlocal_file = Path(\"local_file.tpz\")\nt5.save(local_file)\n\nold_t5 = Table.load(local_file)\nprint(\"the t5 table had\", len(old_t5), \"rows\") # the t5 table had 135000 rows\n\ndel old_t5 # only removes the in-memory object\n\nprint(\"old_t5 still exists?\", local_file.exists())\nprint(\"path:\", local_file)\n\nimport os\nos.remove(local_file)\n ## tablite uses .npy for storage because it is fast. ## this means you can make a table persistent using .save local_file = Path(\"local_file.tpz\") t5.save(local_file) old_t5 = Table.load(local_file) print(\"the t5 table had\", len(old_t5), \"rows\") # the t5 table had 135000 rows del old_t5 # only removes the in-memory object print(\"old_t5 still exists?\", local_file.exists()) print(\"path:\", local_file) import os os.remove(local_file) loading 'local_file.tpz' file: 55%|\u2588\u2588\u2588\u2588\u2588\u258d | 9851/18000 [00:02<00:01, 4386.96it/s] loading 'local_file.tpz' file: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 18000/18000 [00:04<00:00, 4417.27it/s]\n the t5 table had 135000 rows\nold_t5 still exists? True\npath: local_file.tpz\n If you want to save a table from one session to another use save=True . This tells the garbage collector to leave the tablite Table on disk, so you can load it again without changing your code. For example: First time you run t = Table.import_file(....big.csv) it may take a minute or two. If you then add t.save=True and restart python, the second time you run t = Table.import_file(....big.csv) it will take a few milliseconds instead of minutes. In\u00a0[29]: Copied! unfiltered = Table({'a':[1,2,3,4], 'b':[10,20,30,40]})\n unfiltered = Table({'a':[1,2,3,4], 'b':[10,20,30,40]}) In\u00a0[30]: Copied! true,false = unfiltered.filter(\n [\n {\"column1\": 'a', \"criteria\":\">=\", 'value2':3}\n ], filter_type='all'\n)\n true,false = unfiltered.filter( [ {\"column1\": 'a', \"criteria\":\">=\", 'value2':3} ], filter_type='all' ) In\u00a0[31]: Copied! true\n true Out[31]: #ab 0330 1440 In\u00a0[32]: Copied! false.show() # using show here to show that terminal users can have a nice view too.\n false.show() # using show here to show that terminal users can have a nice view too. +==+=+==+\n|# |a|b |\n+--+-+--+\n| 0|1|10|\n| 1|2|20|\n+==+=+==+\n In\u00a0[33]: Copied! ty = Table({'a':[1,2,3,4],'b': [10,20,30,40]})\n ty = Table({'a':[1,2,3,4],'b': [10,20,30,40]}) In\u00a0[34]: Copied! 
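filter also takes several expression dicts at once; a hedged sketch, assuming filter_type='any' is the union counterpart of the 'all' shown above:

## a sketch: two criteria combined with filter_type='any' (assumption)
true_rows, false_rows = unfiltered.filter(
    [
        {"column1": 'a', "criteria": ">=", "value2": 3},
        {"column1": 'b', "criteria": "==", "value2": 10},
    ],
    filter_type='any',
)
assert len(true_rows) == 3  ## a>=3 matches two rows, b==10 adds the first row
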
## typical python\nany(i > 3 for i in ty['a'])\n ## typical python any(i > 3 for i in ty['a']) Out[34]: True In\u00a0[35]: Copied! ## hereby you can do:\nany( ty.any(**{'a':lambda x:x>3}).rows )\n ## hereby you can do: any( ty.any(**{'a':lambda x:x>3}).rows ) Out[35]: True In\u00a0[36]: Copied! ## if you have multiple criteria this also works:\nall( ty.all(**{'a': lambda x:x>=2, 'b': lambda x:x<=30}).rows )\n ## if you have multiple criteria this also works: all( ty.all(**{'a': lambda x:x>=2, 'b': lambda x:x<=30}).rows ) Out[36]: True In\u00a0[37]: Copied! ## or this if you want to see the table.\nty.all(a=lambda x:x>2, b=lambda x:x<=30)\n ## or this if you want to see the table. ty.all(a=lambda x:x>2, b=lambda x:x<=30) Out[37]: #ab 0330 In\u00a0[38]: Copied! ## As `all` and `any` returns tables, this also means that you can chain operations:\nty.any(a=lambda x:x>2).any(b=30)\n ## As `all` and `any` returns tables, this also means that you can chain operations: ty.any(a=lambda x:x>2).any(b=30) Out[38]: #ab 0330 In\u00a0[39]: Copied! table = Table({\n 'A':[ 1, None, 8, 3, 4, 6, 5, 7, 9],\n 'B':[10,'100', 1, 1, 1, 1, 10, 10, 10],\n 'C':[ 0, 1, 0, 1, 0, 1, 0, 1, 0],\n})\ntable\n table = Table({ 'A':[ 1, None, 8, 3, 4, 6, 5, 7, 9], 'B':[10,'100', 1, 1, 1, 1, 10, 10, 10], 'C':[ 0, 1, 0, 1, 0, 1, 0, 1, 0], }) table Out[39]: #ABC 01100 1None1001 2810 3311 4410 5611 65100 77101 89100 In\u00a0[40]: Copied! sort_order = {'B': False, 'C': False, 'A': False}\nassert not table.is_sorted(mapping=sort_order)\n\nsorted_table = table.sort(mapping=sort_order)\nsorted_table\n sort_order = {'B': False, 'C': False, 'A': False} assert not table.is_sorted(mapping=sort_order) sorted_table = table.sort(mapping=sort_order) sorted_table creating sort index: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 3/3 [00:00<00:00, 2719.45it/s]\ncreating sort index: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 3/3 [00:00<00:00, 3434.20it/s]\njoin: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 3/3 [00:00<00:00, 1902.47it/s]\n Sort is reasonable effective as it uses multiprocessing above a million fields. Hint: You can set this limit in tablite.config , like this: In\u00a0[41]: Copied! from tablite.config import Config\nprint(f\"multiprocessing is used above {Config.SINGLE_PROCESSING_LIMIT:,} fields\")\n from tablite.config import Config print(f\"multiprocessing is used above {Config.SINGLE_PROCESSING_LIMIT:,} fields\") multiprocessing is used above 1,000,000 fields\n In\u00a0[42]: Copied! import math\nn = math.ceil(1_000_000 / (9*3))\n\ntable = Table({\n 'A':[ 1, None, 8, 3, 4, 6, 5, 7, 9]*n,\n 'B':[10,'100', 1, 1, 1, 1, 10, 10, 10]*n,\n 'C':[ 0, 1, 0, 1, 0, 1, 0, 1, 0]*n,\n})\ntable\n import math n = math.ceil(1_000_000 / (9*3)) table = Table({ 'A':[ 1, None, 8, 3, 4, 6, 5, 7, 9]*n, 'B':[10,'100', 1, 1, 1, 1, 10, 10, 10]*n, 'C':[ 0, 1, 0, 1, 0, 1, 0, 1, 0]*n, }) table Out[42]: #ABC 01100 1None1001 2810 3311 4410 5611 65100............ 333,335810 333,336311 333,337410 333,338611 333,3395100 333,3407101 333,3419100 In\u00a0[43]: Copied! import time as cputime\nstart = cputime.time()\nsort_order = {'B': False, 'C': False, 'A': False}\nsorted_table = table.sort(mapping=sort_order) # sorts 1M values.\nprint(\"table sorting took \", round(cputime.time() - start,3), \"secs\")\nsorted_table\n import time as cputime start = cputime.time() sort_order = {'B': False, 'C': False, 'A': False} sorted_table = table.sort(mapping=sort_order) # sorts 1M values. 
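## note (a sketch): the sort above uses multiprocessing because the table exceeds
## Config.SINGLE_PROCESSING_LIMIT (1,000,000 fields, as printed earlier).
## the threshold could be tuned before sorting, e.g.:
## from tablite.config import Config
## Config.SINGLE_PROCESSING_LIMIT = 5_000_000  ## hypothetical new limit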
print(\"table sorting took \", round(cputime.time() - start,3), \"secs\") sorted_table creating sort index: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 3/3 [00:00<00:00, 4.20it/s]\njoin: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 3/3 [00:00<00:00, 18.17it/s] table sorting took 0.913 secs\n \n In\u00a0[44]: Copied! n = math.ceil(1_000_000 / (9*3))\n\ntable = Table({\n 'A':[ 1, None, 8, 3, 4, 6, 5, 7, 9]*n,\n 'B':[10,'100', 1, 1, 1, 1, 10, 10, 10]*n,\n 'C':[ 0, 1, 0, 1, 0, 1, 0, 1, 0]*n,\n})\ntable\n n = math.ceil(1_000_000 / (9*3)) table = Table({ 'A':[ 1, None, 8, 3, 4, 6, 5, 7, 9]*n, 'B':[10,'100', 1, 1, 1, 1, 10, 10, 10]*n, 'C':[ 0, 1, 0, 1, 0, 1, 0, 1, 0]*n, }) table Out[44]: #ABC 01100 1None1001 2810 3311 4410 5611 65100............ 333,335810 333,336311 333,337410 333,338611 333,3395100 333,3407101 333,3419100 In\u00a0[45]: Copied! from tablite import GroupBy as gb\ngrpby = table.groupby(keys=['C', 'B'], functions=[('A', gb.count)])\ngrpby\n from tablite import GroupBy as gb grpby = table.groupby(keys=['C', 'B'], functions=[('A', gb.count)]) grpby groupby: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 333342/333342 [00:00<00:00, 427322.50it/s]\n Out[45]: #CBCount(A) 0010111114 1110037038 20174076 31174076 411037038 Here is the list of groupby functions: class GroupBy(object): \n max = Max # shortcuts to avoid having to type a long list of imports.\n min = Min\n sum = Sum\n product = Product\n first = First\n last = Last\n count = Count\n count_unique = CountUnique\n avg = Average\n stdev = StandardDeviation\n median = Median\n mode = Mode\n In\u00a0[46]: Copied! t = Table({\n 'A':[1, 1, 2, 2, 3, 3] * 2,\n 'B':[1, 2, 3, 4, 5, 6] * 2,\n 'C':[6, 5, 4, 3, 2, 1] * 2,\n})\nt\n t = Table({ 'A':[1, 1, 2, 2, 3, 3] * 2, 'B':[1, 2, 3, 4, 5, 6] * 2, 'C':[6, 5, 4, 3, 2, 1] * 2, }) t Out[46]: #ABC 0116 1125 2234 3243 4352 5361 6116 7125 8234 92431035211361 In\u00a0[47]: Copied! t2 = t.pivot(rows=['C'], columns=['A'], functions=[('B', gb.sum), ('B', gb.count)], values_as_rows=False)\nt2\n t2 = t.pivot(rows=['C'], columns=['A'], functions=[('B', gb.sum), ('B', gb.count)], values_as_rows=False) t2 pivot: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 14/14 [00:00<00:00, 3643.83it/s]\n Out[47]: #CSum(B,A=1)Count(B,A=1)Sum(B,A=2)Count(B,A=2)Sum(B,A=3)Count(B,A=3) 0622NoneNoneNoneNone 1542NoneNoneNoneNone 24NoneNone62NoneNone 33NoneNone82NoneNone 42NoneNoneNoneNone102 51NoneNoneNoneNone122 In\u00a0[48]: Copied! numbers = Table()\nnumbers.add_column('number', data=[ 1, 2, 3, 4, None])\nnumbers.add_column('colour', data=['black', 'blue', 'white', 'white', 'blue'])\n\nletters = Table()\nletters.add_column('letter', data=[ 'a', 'b', 'c', 'd', None])\nletters.add_column('color', data=['blue', 'white', 'orange', 'white', 'blue'])\n numbers = Table() numbers.add_column('number', data=[ 1, 2, 3, 4, None]) numbers.add_column('colour', data=['black', 'blue', 'white', 'white', 'blue']) letters = Table() letters.add_column('letter', data=[ 'a', 'b', 'c', 'd', None]) letters.add_column('color', data=['blue', 'white', 'orange', 'white', 'blue']) In\u00a0[49]: Copied! 
## left join\n## SELECT number, letter FROM numbers LEFT JOIN letters ON numbers.colour == letters.color\nleft_join = numbers.left_join(letters, left_keys=['colour'], right_keys=['color'], left_columns=['number'], right_columns=['letter'])\nleft_join\n ## left join ## SELECT number, letter FROM numbers LEFT JOIN letters ON numbers.colour == letters.color left_join = numbers.left_join(letters, left_keys=['colour'], right_keys=['color'], left_columns=['number'], right_columns=['letter']) left_join join: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 2/2 [00:00<00:00, 1221.94it/s]\n Out[49]: #numberletter 01None 12a 22None 3Nonea 4NoneNone 53b 63d 74b 84d In\u00a0[50]: Copied! ## inner join\n## SELECT number, letter FROM numbers JOIN letters ON numbers.colour == letters.color\ninner_join = numbers.inner_join(letters, left_keys=['colour'], right_keys=['color'], left_columns=['number'], right_columns=['letter'])\ninner_join\n ## inner join ## SELECT number, letter FROM numbers JOIN letters ON numbers.colour == letters.color inner_join = numbers.inner_join(letters, left_keys=['colour'], right_keys=['color'], left_columns=['number'], right_columns=['letter']) inner_join join: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 2/2 [00:00<00:00, 1121.77it/s]\n Out[50]: #numberletter 02a 12None 2Nonea 3NoneNone 43b 53d 64b 74d In\u00a0[51]: Copied! # outer join\n## SELECT number, letter FROM numbers OUTER JOIN letters ON numbers.colour == letters.color\nouter_join = numbers.outer_join(letters, left_keys=['colour'], right_keys=['color'], left_columns=['number'], right_columns=['letter'])\nouter_join\n # outer join ## SELECT number, letter FROM numbers OUTER JOIN letters ON numbers.colour == letters.color outer_join = numbers.outer_join(letters, left_keys=['colour'], right_keys=['color'], left_columns=['number'], right_columns=['letter']) outer_join join: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 2/2 [00:00<00:00, 1585.15it/s]\n Out[51]: #numberletter 01None 12a 22None 3Nonea 4NoneNone 53b 63d 74b 84d 9Nonec Q: But ...I think there's a bug in the join... A: Venn diagrams do not explain joins. A Venn diagram is a widely-used diagram style that shows the logical relation between sets, popularised by John Venn in the 1880s. The diagrams are used to teach elementary set theory, and to illustrate simple set relationshipssource: en.wikipedia.org Joins operate over rows and when there are duplicate rows, these will be replicated in the output. Many beginners are surprised by this, because they didn't read the SQL standard. Q: So what do I do? A: If you want to get rid of duplicates using tablite, use the index functionality across all columns and pick the first row from each index. Here's the recipe that starts with plenty of duplicates: In\u00a0[52]: Copied! old_table = Table({\n'A':[1,1,1,2,2,2,3,3,3],\n'B':[1,1,4,2,2,5,3,3,6],\n})\nold_table\n old_table = Table({ 'A':[1,1,1,2,2,2,3,3,3], 'B':[1,1,4,2,2,5,3,3,6], }) old_table Out[52]: #AB 011 111 214 322 422 525 633 733 836 In\u00a0[53]: Copied! ## CREATE TABLE OF UNIQUE ENTRIES (a.k.a. DEDUPLICATE)\nnew_table = old_table.drop_duplicates()\nnew_table\n ## CREATE TABLE OF UNIQUE ENTRIES (a.k.a. DEDUPLICATE) new_table = old_table.drop_duplicates() new_table 9it [00:00, 11329.15it/s]\njoin: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 2/2 [00:00<00:00, 1819.26it/s]\n Out[53]: #AB 011 114 222 325 433 536 You can also use groupby; We'll get to that in a minute. 
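And here is that groupby route sketched out, assuming that grouping on every column yields exactly one row per unique combination:

## a sketch: deduplicate by grouping on all columns
from tablite import GroupBy as gb
unique = old_table.groupby(keys=['A', 'B'], functions=[('A', gb.count)])
unique  ## one row per unique (A, B); Count(A) shows how many duplicates were collapsed
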
Lookup is a special case of a search loop: Say for example you are planning a concert and want to make sure that your friends can make it home using public transport: You would have to find the first departure after the concert ends towards their home. A join would only give you a direct match on the time. Lookup allows you \"to iterate through a list of data and find the first match given a set of criteria.\" Here's an example: First we have our list of friends and their stops. In\u00a0[54]: Copied! friends = Table({\n\"name\":['Alice', 'Betty', 'Charlie', 'Dorethy', 'Edward', 'Fred'],\n\"stop\":['Downtown-1', 'Downtown-2', 'Hillside View', 'Hillside Crescent', 'Downtown-2', 'Chicago'],\n})\nfriends\n friends = Table({ \"name\":['Alice', 'Betty', 'Charlie', 'Dorethy', 'Edward', 'Fred'], \"stop\":['Downtown-1', 'Downtown-2', 'Hillside View', 'Hillside Crescent', 'Downtown-2', 'Chicago'], }) friends Out[54]: #namestop 0AliceDowntown-1 1BettyDowntown-2 2CharlieHillside View 3DorethyHillside Crescent 4EdwardDowntown-2 5FredChicago Next we need a list of bus routes and their time and stops. I don't have that, so I'm making one up: In\u00a0[55]: Copied! import random\nrandom.seed(11)\ntable_size = 40\n\ntimes = [DataTypes.time(random.randint(21, 23), random.randint(0, 59)) for i in range(table_size)]\nstops = ['Stadium', 'Hillside', 'Hillside View', 'Hillside Crescent', 'Downtown-1', 'Downtown-2',\n 'Central station'] * 2 + [f'Random Road-{i}' for i in range(table_size)]\nroute = [random.choice([1, 2, 3]) for i in stops]\n import random random.seed(11) table_size = 40 times = [DataTypes.time(random.randint(21, 23), random.randint(0, 59)) for i in range(table_size)] stops = ['Stadium', 'Hillside', 'Hillside View', 'Hillside Crescent', 'Downtown-1', 'Downtown-2', 'Central station'] * 2 + [f'Random Road-{i}' for i in range(table_size)] route = [random.choice([1, 2, 3]) for i in stops] In\u00a0[56]: Copied! bus_table = Table({\n\"time\":times,\n\"stop\":stops[:table_size],\n\"route\":route[:table_size],\n})\nbus_table.sort(mapping={'time': False})\n\nprint(\"Departures from Concert Hall towards ...\")\nbus_table[0:10]\n bus_table = Table({ \"time\":times, \"stop\":stops[:table_size], \"route\":route[:table_size], }) bus_table.sort(mapping={'time': False}) print(\"Departures from Concert Hall towards ...\") bus_table[0:10] creating sort index: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 1/1 [00:00<00:00, 1459.90it/s]\njoin: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 3/3 [00:00<00:00, 2421.65it/s]\n Departures from Concert Hall towards ...\n Out[56]: #timestoproute 021:02:00Random Road-62 121:05:00Hillside Crescent2 221:06:00Hillside1 321:25:00Random Road-241 421:29:00Random Road-161 521:32:00Random Road-211 621:33:00Random Road-121 721:36:00Random Road-233 821:38:00Central station2 921:38:00Random Road-82 Let's say the concerts ends at 21:00 and it takes a 10 minutes to get to the bus-stop. Earliest departure must then be 21:10 - goodbye hugs included. In\u00a0[57]: Copied! 
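Each criterion passed to lookup below is a 3-tuple of (value or left column, operator, right column); a hedged reading of the two criteria used next:

## (DataTypes.time(21, 10), "<=", 'time')  ## the constant 21:10 must be <= the bus departure 'time'
## ('stop', "==", 'stop')  ## the friend's 'stop' must equal the bus 'stop'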
lookup_1 = friends.lookup(bus_table, (DataTypes.time(21, 10), \"<=\", 'time'), ('stop', \"==\", 'stop'))\nlookup1_sorted = lookup_1.sorted(mapping={'time': False, 'name':False}, sort_mode='unix')\nlookup1_sorted\n lookup_1 = friends.lookup(bus_table, (DataTypes.time(21, 10), \"<=\", 'time'), ('stop', \"==\", 'stop')) lookup1_sorted = lookup_1.sorted(mapping={'time': False, 'name':False}, sort_mode='unix') lookup1_sorted 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 6/6 [00:00<00:00, 1513.92it/s]\njoin: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 3/3 [00:00<00:00, 2003.65it/s]\ncreating sort index: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 2/2 [00:00<00:00, 2589.88it/s]\njoin: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 5/5 [00:00<00:00, 2034.29it/s]\n Out[57]: #namestoptimestop_1route 0FredChicagoNoneNoneNone 1BettyDowntown-221:51:00Downtown-21 2EdwardDowntown-221:51:00Downtown-21 3CharlieHillside View22:19:00Hillside View2 4AliceDowntown-123:12:00Downtown-13 5DorethyHillside Crescent23:54:00Hillside Crescent1 Lookup's ability to custom criteria is thereby far more versatile than SQL joins. But with great power comes great responsibility. In\u00a0[58]: Copied! materials = Table({\n 'bom_id': [1, 2, 3, 4, 5, 6, 7, 8, 9], \n 'partial_of': [1, 2, 3, 4, 5, 6, 7, 4, 6], \n 'sku': ['A', 'irrelevant', 'empty carton', 'pkd carton', 'empty pallet', 'pkd pallet', 'pkd irrelevant', 'ppkd carton', 'ppkd pallet'], \n 'material_id': [None, None, None, 3, None, 5, 3, 3, 5], \n 'quantity': [10, 20, 30, 40, 50, 60, 70, 80, 90]\n})\n # 9 is a partially packed pallet of 6\n\n## multiple values.\nlooking_for = Table({\n 'bom_id': [3,4,6], \n 'moq': [1,2,3]\n })\n materials = Table({ 'bom_id': [1, 2, 3, 4, 5, 6, 7, 8, 9], 'partial_of': [1, 2, 3, 4, 5, 6, 7, 4, 6], 'sku': ['A', 'irrelevant', 'empty carton', 'pkd carton', 'empty pallet', 'pkd pallet', 'pkd irrelevant', 'ppkd carton', 'ppkd pallet'], 'material_id': [None, None, None, 3, None, 5, 3, 3, 5], 'quantity': [10, 20, 30, 40, 50, 60, 70, 80, 90] }) # 9 is a partially packed pallet of 6 ## multiple values. looking_for = Table({ 'bom_id': [3,4,6], 'moq': [1,2,3] }) Our goals is now to find the quantity from the materials table based on the items in the looking_for table. This requires two steps: - lookup
- filter with
all by dropping items that didn't match. In\u00a0[59]: Copied! ## step 1/2:\nproducts_lookup = materials.lookup(looking_for, (\"bom_id\", \"==\", \"bom_id\"), (\"partial_of\", \"==\", \"bom_id\"), all=False) \nproducts_lookup\n ## step 1/2: products_lookup = materials.lookup(looking_for, (\"bom_id\", \"==\", \"bom_id\"), (\"partial_of\", \"==\", \"bom_id\"), all=False) products_lookup 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 9/9 [00:00<00:00, 3651.81it/s]\njoin: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 2/2 [00:00<00:00, 1625.38it/s]\n Out[59]: #bom_idpartial_ofskumaterial_idquantitybom_id_1moq 011ANone10NoneNone 122irrelevantNone20NoneNone 233empty cartonNone3031 344pkd carton34042 455empty palletNone50NoneNone 566pkd pallet56063 677pkd irrelevant370NoneNone 784ppkd carton38042 896ppkd pallet59063 In\u00a0[60]: Copied! ## step 2/2:\nproducts = products_lookup.all(bom_id_1=lambda x: x is not None)\nproducts\n ## step 2/2: products = products_lookup.all(bom_id_1=lambda x: x is not None) products Out[60]: #bom_idpartial_ofskumaterial_idquantitybom_id_1moq 033empty cartonNone3031 144pkd carton34042 266pkd pallet56063 384ppkd carton38042 496ppkd pallet59063 The faster way to solve this problem is to use match ! Here is the example: In\u00a0[61]: Copied! products_matched = materials.match(looking_for, (\"bom_id\", \"==\", \"bom_id\"), (\"partial_of\", \"==\", \"bom_id\"))\nproducts_matched\n products_matched = materials.match(looking_for, (\"bom_id\", \"==\", \"bom_id\"), (\"partial_of\", \"==\", \"bom_id\")) products_matched Out[61]: #bom_idpartial_ofskumaterial_idquantitybom_id_1moq 033empty cartonNone3031 144pkd carton34042 266pkd pallet56063 384ppkd carton38042 496ppkd pallet59063 In\u00a0[62]: Copied! assert products == products_matched\n assert products == products_matched In\u00a0[63]: Copied! from tablite import Table\nt = Table() # create table\nt.add_columns('row','A','B','C') # add columns\n from tablite import Table t = Table() # create table t.add_columns('row','A','B','C') # add columns The following examples are all valid and append the row (1,2,3) to the table. In\u00a0[64]: Copied! t.add_rows(1, 1, 2, 3) # individual values\nt.add_rows([2, 1, 2, 3]) # list of values\nt.add_rows((3, 1, 2, 3)) # tuple of values\nt.add_rows(*(4, 1, 2, 3)) # unpacked tuple\nt.add_rows(row=5, A=1, B=2, C=3) # keyword - args\nt.add_rows(**{'row': 6, 'A': 1, 'B': 2, 'C': 3}) # dict / json.\n t.add_rows(1, 1, 2, 3) # individual values t.add_rows([2, 1, 2, 3]) # list of values t.add_rows((3, 1, 2, 3)) # tuple of values t.add_rows(*(4, 1, 2, 3)) # unpacked tuple t.add_rows(row=5, A=1, B=2, C=3) # keyword - args t.add_rows(**{'row': 6, 'A': 1, 'B': 2, 'C': 3}) # dict / json. The following examples add two rows to the table In\u00a0[65]: Copied! t.add_rows((7, 1, 2, 3), (8, 4, 5, 6)) # two (or more) tuples.\nt.add_rows([9, 1, 2, 3], [10, 4, 5, 6]) # two or more lists\nt.add_rows({'row': 11, 'A': 1, 'B': 2, 'C': 3},\n {'row': 12, 'A': 4, 'B': 5, 'C': 6}) # two (or more) dicts as args.\nt.add_rows(*[{'row': 13, 'A': 1, 'B': 2, 'C': 3},\n {'row': 14, 'A': 1, 'B': 2, 'C': 3}]) # list of dicts.\n t.add_rows((7, 1, 2, 3), (8, 4, 5, 6)) # two (or more) tuples. t.add_rows([9, 1, 2, 3], [10, 4, 5, 6]) # two or more lists t.add_rows({'row': 11, 'A': 1, 'B': 2, 'C': 3}, {'row': 12, 'A': 4, 'B': 5, 'C': 6}) # two (or more) dicts as args. t.add_rows(*[{'row': 13, 'A': 1, 'B': 2, 'C': 3}, {'row': 14, 'A': 1, 'B': 2, 'C': 3}]) # list of dicts. In\u00a0[66]: Copied! 
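Because row based operations cause a lot of IO, the same rows can also be loaded column-wise in a single pass; a small sketch:

## a sketch: column-wise loading instead of add_rows, one assignment per column
rows = [(1, 1, 2, 3), (2, 1, 2, 3), (3, 1, 2, 3)]  ## ...and so on
fast_t = Table()
for i, name in enumerate(['row', 'A', 'B', 'C']):
    fast_t[name] = [r[i] for r in rows]
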
t\n t Out[66]: #rowABC 01123 12123 23123 34123 45123 56123 67123 78456 89123 9104561011123111245612131231314123 As the row incremented from 1 in the first of these examples, and finished with row: 14 , you can now see the whole table above In\u00a0[67]: Copied! from pathlib import Path\npath = Path('tests/data/book1.csv')\ntx = Table.from_file(path)\ntx\n from pathlib import Path path = Path('tests/data/book1.csv') tx = Table.from_file(path) tx Collecting tasks: 'tests/data/book1.csv'\nDumping tasks: 'tests/data/book1.csv'\n importing file: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 1/1 [00:00<00:00, 444.08it/s]\n Out[67]: #abcdef 010.0606060610.0909090910.1212121210.1515151520.181818182 120.1212121210.2424242420.4848484850.969696971.939393939 230.2424242420.4848484850.969696971.9393939393.878787879 340.4848484850.969696971.9393939393.8787878797.757575758 450.969696971.9393939393.8787878797.75757575815.51515152 561.9393939393.8787878797.75757575815.5151515231.03030303 673.8787878797.75757575815.5151515231.0303030362.06060606.....................383916659267088.033318534175.066637068350.0133274000000.0266548000000.0394033318534175.066637068350.0133274000000.0266548000000.0533097000000.0404166637068350.0133274000000.0266548000000.0533097000000.01066190000000.04142133274000000.0266548000000.0533097000000.01066190000000.02132390000000.04243266548000000.0533097000000.01066190000000.02132390000000.04264770000000.04344533097000000.01066190000000.02132390000000.04264770000000.08529540000000.044451066190000000.02132390000000.04264770000000.08529540000000.017059100000000.0 Note that you can also add start, limit and chunk_size to the file reader. Here's an example: In\u00a0[68]: Copied! path = Path('tests/data/book1.csv')\ntx2 = Table.from_file(path, start=2, limit=15)\ntx2\n path = Path('tests/data/book1.csv') tx2 = Table.from_file(path, start=2, limit=15) tx2 Collecting tasks: 'tests/data/book1.csv'\n importing file: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 1/1 [00:00<00:00, 391.22it/s] Dumping tasks: 'tests/data/book1.csv'\n \n Out[68]: #abcdef 030.2424242420.4848484850.969696971.9393939393.878787879 140.4848484850.969696971.9393939393.8787878797.757575758 250.969696971.9393939393.8787878797.75757575815.51515152 361.9393939393.8787878797.75757575815.5151515231.03030303 473.8787878797.75757575815.5151515231.0303030362.06060606 587.75757575815.5151515231.0303030362.06060606124.1212121 6915.5151515231.0303030362.06060606124.1212121248.2424242 71031.0303030362.06060606124.1212121248.2424242496.4848485 81162.06060606124.1212121248.2424242496.4848485992.969697 912124.1212121248.2424242496.4848485992.9696971985.9393941013248.2424242496.4848485992.9696971985.9393943971.8787881114496.4848485992.9696971985.9393943971.8787887943.7575761215992.9696971985.9393943971.8787887943.75757615887.5151513161985.9393943971.8787887943.75757615887.5151531775.030314173971.8787887943.75757615887.5151531775.030363550.06061 How good is the file_reader? I've included all formats in the test suite that are publicly available from the Alan Turing institute, dateutils) and Python's csv reader. What about MM-DD-YYYY formats? Some users from the US ask why the csv reader doesn't read the month-day-year format. The answer is simple: It's not an iso8601 format. The US month-day-year format is a locale that may be used a lot in the US, but it isn't an international standard. If you need to work with MM-DD-YYYY you will find that the file_reader will import the values as text (str). 
You can then reformat it with a custom function like: In\u00a0[69]: Copied! s = \"03-21-1998\"\nfrom datetime import date\nf = lambda s: date(int(s[-4:]), int(s[:2]), int(s[3:5]))\nf(s)\n s = \"03-21-1998\" from datetime import date f = lambda s: date(int(s[-4:]), int(s[:2]), int(s[3:5])) f(s) Out[69]: datetime.date(1998, 3, 21) In\u00a0[70]: Copied! from tablite.import_utils import file_readers\nfor k,v in file_readers.items():\n print(k,v)\n from tablite.import_utils import file_readers for k,v in file_readers.items(): print(k,v) fods <function excel_reader at 0x7f36a3ef8c10>\njson <function excel_reader at 0x7f36a3ef8c10>\nhtml <function from_html at 0x7f36a3ef8b80>\nhdf5 <function from_hdf5 at 0x7f36a3ef8a60>\nsimple <function excel_reader at 0x7f36a3ef8c10>\nrst <function excel_reader at 0x7f36a3ef8c10>\nmediawiki <function excel_reader at 0x7f36a3ef8c10>\nxlsx <function excel_reader at 0x7f36a3ef8c10>\nxls <function excel_reader at 0x7f36a3ef8c10>\nxlsm <function excel_reader at 0x7f36a3ef8c10>\ncsv <function text_reader at 0x7f36a3ef9000>\ntsv <function text_reader at 0x7f36a3ef9000>\ntxt <function text_reader at 0x7f36a3ef9000>\nods <function ods_reader at 0x7f36a3ef8ca0>\n (2) define your new file reader In\u00a0[71]: Copied! def my_magic_reader(path, **kwargs): # define your new file reader.\n print(\"do magic with {path}\")\n return\n def my_magic_reader(path, **kwargs): # define your new file reader. print(\"do magic with {path}\") return (3) add it to the list of readers. In\u00a0[72]: Copied! file_readers['my_special_format'] = my_magic_reader\n file_readers['my_special_format'] = my_magic_reader The file_readers are all in tablite.core so if you intend to extend the readers, I recommend that you start here. In\u00a0[73]: Copied! file = Path('example.xlsx')\ntx2.to_xlsx(file)\nos.remove(file)\n file = Path('example.xlsx') tx2.to_xlsx(file) os.remove(file) In\u00a0[74]: Copied! from tablite import Table\n\nt = Table({\n'a':[1, 2, 8, 3, 4, 6, 5, 7, 9],\n'b':[10, 100, 3, 4, 16, -1, 10, 10, 10],\n})\nt.sort(mapping={\"a\":False})\nt\n from tablite import Table t = Table({ 'a':[1, 2, 8, 3, 4, 6, 5, 7, 9], 'b':[10, 100, 3, 4, 16, -1, 10, 10, 10], }) t.sort(mapping={\"a\":False}) t creating sort index: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 1/1 [00:00<00:00, 1674.37it/s]\njoin: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 2/2 [00:00<00:00, 1701.89it/s]\n Out[74]: #ab 0110 12100 234 3416 4510 56-1 6710 783 8910 In\u00a0[75]: Copied! %pip install matplotlib -q\n %pip install matplotlib -q Note: you may need to restart the kernel to use updated packages.\n In\u00a0[76]: Copied! import matplotlib.pyplot as plt\nplt.plot(t['a'], t['b'])\nplt.ylabel('Hello Figure')\nplt.show()\n import matplotlib.pyplot as plt plt.plot(t['a'], t['b']) plt.ylabel('Hello Figure') plt.show() In\u00a0[77]: Copied! ## Let's monitor the memory and record the observations into a table!\nimport psutil, os, gc\nfrom time import process_time,sleep\nprocess = psutil.Process(os.getpid())\n\ndef mem_time(): # go and check taskmanagers memory usage.\n return process.memory_info().rss, process_time()\n\ndigits = 1_000_000\n\nrecords = Table({'method':[], 'memory':[], 'time':[]})\n ## Let's monitor the memory and record the observations into a table! import psutil, os, gc from time import process_time,sleep process = psutil.Process(os.getpid()) def mem_time(): # go and check taskmanagers memory usage. 
return process.memory_info().rss, process_time() digits = 1_000_000 records = Table({'method':[], 'memory':[], 'time':[]}) The row based format: 1 million 10-tuples In\u00a0[78]: Copied! before, start = mem_time()\nL = [tuple([11 for _ in range(10)]) for _ in range(digits)]\nafter, end = mem_time() \ndel L\ngc.collect()\n\nrecords.add_rows(*('1e6 lists w. 10 integers', after - before, round(end-start,4)))\nrecords\n before, start = mem_time() L = [tuple([11 for _ in range(10)]) for _ in range(digits)] after, end = mem_time() del L gc.collect() records.add_rows(*('1e6 lists w. 10 integers', after - before, round(end-start,4))) records Out[78]: #methodmemorytime 01e6 lists w. 10 integers1190543360.5045 The column based format: 10 columns with 1M values: In\u00a0[79]: Copied! before, start = mem_time()\nL = [[11 for i2 in range(digits)] for i1 in range(10)]\nafter,end = mem_time()\n\ndel L\ngc.collect()\nrecords.add_rows(('10 lists with 1e6 integers', after - before, round(end-start,4)))\n before, start = mem_time() L = [[11 for i2 in range(digits)] for i1 in range(10)] after,end = mem_time() del L gc.collect() records.add_rows(('10 lists with 1e6 integers', after - before, round(end-start,4))) We've thereby saved 50 Mb by avoiding the overhead from managing 1 million lists. Q: But why didn't I just use an array? It would have even lower memory footprint. A: First, array's don't handle None's and we get that frequently in dirty csv data. Second, Table needs even less memory. Let's try with an array: In\u00a0[80]: Copied! import array\n\nbefore, start = mem_time()\nL = [array.array('i', [11 for _ in range(digits)]) for _ in range(10)]\nafter,end = mem_time()\n\ndel L\ngc.collect()\nrecords.add_rows(('10 lists with 1e6 integers in arrays', after - before, round(end-start,4)))\nrecords\n import array before, start = mem_time() L = [array.array('i', [11 for _ in range(digits)]) for _ in range(10)] after,end = mem_time() del L gc.collect() records.add_rows(('10 lists with 1e6 integers in arrays', after - before, round(end-start,4))) records Out[80]: #methodmemorytime 01e6 lists w. 10 integers1190543360.5045 110 lists with 1e6 integers752762880.1906 210 lists with 1e6 integers in arrays398336000.3633 Finally let's use a tablite.Table : In\u00a0[81]: Copied! before,start = mem_time()\nt = Table(columns={str(i1): [11 for i2 in range(digits)] for i1 in range(10)})\nafter,end = mem_time()\n\nrecords.add_rows(('Table with 10 columns with 1e6 integers', after - before, round(end-start,4)))\n\nbefore,start = mem_time()\nt2 = t.copy()\nafter,end = mem_time()\n\nrecords.add_rows(('2 Tables with 10 columns with 1e6 integers each', after - before, round(end-start,4)))\n\n## Let's show it, so we know nobody's cheating:\nt2\n before,start = mem_time() t = Table(columns={str(i1): [11 for i2 in range(digits)] for i1 in range(10)}) after,end = mem_time() records.add_rows(('Table with 10 columns with 1e6 integers', after - before, round(end-start,4))) before,start = mem_time() t2 = t.copy() after,end = mem_time() records.add_rows(('2 Tables with 10 columns with 1e6 integers each', after - before, round(end-start,4))) ## Let's show it, so we know nobody's cheating: t2 Out[81]: #0123456789 011111111111111111111 111111111111111111111 211111111111111111111 311111111111111111111 411111111111111111111 511111111111111111111 611111111111111111111................................. 
999,99311111111111111111111 999,99411111111111111111111 999,99511111111111111111111 999,99611111111111111111111 999,99711111111111111111111 999,99811111111111111111111 999,99911111111111111111111 In\u00a0[82]: Copied! records\n records Out[82]: #methodmemorytime 01e6 lists w. 10 integers1190543360.5045 110 lists with 1e6 integers752762880.1906 210 lists with 1e6 integers in arrays398336000.3633 3Table with 10 columns with 1e6 integers01.9569 42 Tables with 10 columns with 1e6 integers each00.0001 Conclusion: whilst the common worst case (1M lists with 10 integers) take up 118 Mb of RAM, Tablite's tables vanish in the noise of memory measurement. Pandas also permits the usage of namedtuples, which are unpacked upon entry. from collections import namedtuple\nPoint = namedtuple(\"Point\", \"x y\")\npoints = [Point(0, 0), Point(0, 3)]\npd.DataFrame(points)\n Doing that in tablite is a bit different. To unpack the named tuple, you should do so explicitly: t = Table({'x': [p.x for p in points], 'y': [p.y for p in points]})\n However should you want to keep the points as namedtuple, you can do so in tablite: t = Table()\nt['points'] = points\n Tablite will store a serialised version of the points, so your memory overhead will be close to zero. "},{"location":"tutorial/#tablite","title":"Tablite\u00b6","text":""},{"location":"tutorial/#introduction","title":"Introduction\u00b6","text":"Tablite fills the data-science space where incremental data processing based on: - Datasets are larger than memory.
- You don't want to worry about datatypes.
Tablite thereby competes with: - Pandas, but saves you the memory overhead.
- Numpy, but spares you from worrying about lower-level datatypes.
- SQLite, by sheer speed.
- Polars, by working beyond RAM.
- Other libraries for data cleaning, thanks to tablite's powerful
datatypes module. Install: pip install tablite Usage: >>> from tablite import Table Upgrade: pip install tablite --no-cache --upgrade "},{"location":"tutorial/#overview","title":"Overview\u00b6","text":"(Version 2023.6.0 and later. For older version see this) - Tablite handles all Python datatypes:
str , float , bool , int , date , datetime , time , timedelta and None . - you can select:
- all rows in a column as
table['A'] - rows across all columns as
table[4:8] - or a slice as
table['A', 'B', slice(4,8) ] . - you can update with
table['A'][2] = new value - you can store or send data using json, by:
- dumping to json:
json_str = table.to_json() , or - you can load it with
Table.from_json(json_str) (see the round-trip sketch below). - you can iterate over rows using
for row in Table.rows . - you can ask
column_xyz in Table.columns ? - load from files with
new_table = Table.from_file('this.csv') which has automatic datatype detection - perform inner, outer & left SQL joins between tables as simple as
table_1.inner_join(table2, keys=['A', 'B']) - summarise using
table.groupby( ... ) - create pivot tables using
groupby.pivot( ... ) - perform multi-criteria lookup in tables using
table1.lookup(table2, criteria=..... - and of course a large selection of tools in
from tablite.tools import * "},{"location":"tutorial/#examples","title":"Examples\u00b6","text":"Here are some examples: "},{"location":"tutorial/#api-examples","title":"API Examples\u00b6","text":"In the following sections, example are given of the Tablite API's power features: - Iteration
- Append
- Sort
- Filter
- Index
- Search All
- Search Any
- Lookup
- Join inner, outer, left
- GroupBy
- Pivot table
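The json round-trip mentioned in the overview list above, as a minimal sketch (assuming to_json / from_json preserve the values unchanged):

from tablite import Table

t = Table({'A': [1, 2, 3]})
json_str = t.to_json()  ## dump
t2 = Table.from_json(json_str)  ## load
assert list(t2['A']) == [1, 2, 3]
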
"},{"location":"tutorial/#iteration","title":"ITERATION!\u00b6","text":"Iteration supports for loops and list comprehension at the speed of light: Just use [r for r in table.rows] , or: for row in table.rows:\n row ... Here's a more practical use case: (1) Imagine a table with columns a,b,c,d,e (all integers) like this: "},{"location":"tutorial/#create-index-indices","title":"Create Index / Indices\u00b6","text":"Index supports multi-key indexing using args such as: index = table.index('B','C') . Here's an example: "},{"location":"tutorial/#append","title":"APPEND\u00b6","text":""},{"location":"tutorial/#save","title":"SAVE\u00b6","text":""},{"location":"tutorial/#filter","title":"FILTER!\u00b6","text":""},{"location":"tutorial/#any-all","title":"Any! All?\u00b6","text":"Any and All are cousins of the filter. They're there so you can use them in the same way as you'd use any and all in python - as boolean evaluators: "},{"location":"tutorial/#sort","title":"SORT!\u00b6","text":""},{"location":"tutorial/#groupby","title":"GROUPBY !\u00b6","text":""},{"location":"tutorial/#did-i-say-pivot-table-yes","title":"Did I say pivot table? Yes.\u00b6","text":"Pivot Table is included in the groupby functionality - so yes - you can pivot the groupby on any column that is used for grouping. Here's a simple example: "},{"location":"tutorial/#join","title":"JOIN!\u00b6","text":""},{"location":"tutorial/#lookup","title":"LOOKUP!\u00b6","text":""},{"location":"tutorial/#match","title":"Match\u00b6","text":"If you're looking to do a join where you afterwards remove the empty rows, match is the faster choice. Here is an example. Let's start with two tables: "},{"location":"tutorial/#are-there-other-ways-i-can-add-data","title":"Are there other ways I can add data?\u00b6","text":"Yes - but row based operations cause a lot of IO, so it'll work but be slower: "},{"location":"tutorial/#okay-great-how-do-i-load-data","title":"Okay, great. How do I load data?\u00b6","text":"Easy. Use file_reader . Here's an example: "},{"location":"tutorial/#sweet-what-formats-are-supported-can-i-add-my-own-file-reader","title":"Sweet. What formats are supported? Can I add my own file reader?\u00b6","text":"Yes! This is very good for special log files or custom json formats. Here's how you do it: (1) Go to all existing readers in the tablite.core and find the closest match. "},{"location":"tutorial/#very-nice-how-about-exporting-data","title":"Very nice. How about exporting data?\u00b6","text":"Just use .export "},{"location":"tutorial/#cool-does-it-play-well-with-plotting-packages","title":"Cool. Does it play well with plotting packages?\u00b6","text":"Yes. Here's an example you can copy and paste: "},{"location":"tutorial/#i-like-sql-can-tablite-understand-sql","title":"I like sql. Can tablite understand SQL?\u00b6","text":"Almost. You can use table.to_sql and tablite will return ANSI-92 compliant SQL. You can also create a table using Table.from_sql and tablite will consume ANSI-92 compliant SQL. "},{"location":"tutorial/#but-what-do-i-do-if-im-about-to-run-out-of-memory","title":"But what do I do if I'm about to run out of memory?\u00b6","text":"You wont. Every tablite table is backed by disk. The memory footprint of a table is only the metadata required to know the relationships between variable names and the datastructures. Let's do a comparison: "},{"location":"tutorial/#conclusions","title":"Conclusions\u00b6","text":"This concludes the mega-tutorial to tablite . There's nothing more to it. 
But oh boy it'll save a lot of time. Here's a summary of features: - Everything a list can do.
- import csv*, fods, json, html, simple, rst, mediawiki, xlsx, xls, xlsm, tsv, txt, ods using
Table.from_file(...) - Iterate over rows or columns
- Create multikey
index , sort , use filter , any and all to select. Perform lookup across tables including using custom functions. - Perform multikey
joins with other tables. - Perform
groupby and reorganise data as a pivot table with max, min, sum, first, last, count, unique, average, standard deviation, median and mode. - Update tables with
+= which automatically sorts out the columns - even if they're not in perfect order. "},{"location":"tutorial/#faq","title":"FAQ\u00b6","text":"Question Answer I'm not in a notebook. Is there a nice way to view tables? Yes. table.show() prints the ascii version I'm looking for the equivalent to apply in pandas. Just use list comprehensions: table[column] = [f(x) for x in table[column] What about map ? Just use the python function: mapping = map(f, table[column name]) Is there a where function? It's called any or all like in python: table.any(column_name > 0) . I like sql and sqlite. Can I use sql? Yes. Call table.to_sql() returns ANSI-92 SQL compliant table definition.You can use this in any SQL compliant engine. | sometimes i need to clean up data with datetimes. Is there any tool to help with that? | Yes. Look at DataTypes.DataTypes.round(value, multiple) allows rounding of datetime. "},{"location":"tutorial/#coming-to-tablite-from-pandas","title":"Coming to Tablite from Pandas\u00b6","text":"If you're coming to Tablite from Pandas you will notice some differences. Here's the ultra short comparison to the documentation from Pandas called 10 minutes intro to pandas The tutorials provide the generic overview: - pandas tutorial
- tablite tutorial
Some key differences topic Tablite Viewing data Just use table.show() in print outs, or if you're in a jupyter notebook just use the variable name table Selection Slicing works both on columns and rows, and you can filter using any or all :table['A','B', 2:30:3].any(A=lambda x:x>3) to copy a table use: t2 = t.copy() This is a very fast deep copy, that has no memory overhead as tablites memory manager keeps track of the data. Missing data Tablite uses mixed column format for any format that isn't uniformTo get rid of rows with None s and np.nan s use any:table.drop_na(None, np.nan) Alternatively you can use replace: table.replace(None,5) following the syntax: table.replace_missing_values(sources, target) Operations Descriptive statistics are on a colum by column basis:table['a'].statistics() the pandas function df.apply doesn't exist in tablite. Use a list comprehension instead. For example: df.apply(np.cumsum) is just np.cumsum(t['A']) \"histogramming\" in tablite is per column: table['a'].histogram() string methods? Just use a list comprehensions: table['A', 'B'].any(A=lambda x: \"hello\" in x, B=lambda x: \"world\" in x) Merge Concatenation: Just use + or += as in t1 = t2 + t3 += t4 . If the columns are out of order, tablite will sort the headers according to the order in the first table.If you're worried that the header mismatch use t1.stack(t2) Joins are ANSI92 compliant: t1.join(t2, <...args...>, join_type=...) . Grouping Tablite supports multikey groupby using from tablite import Groupby as gb . table.groupby(keys, functions) Reshaping To reshape a table use transpose . to perform pivot table like operations, use: table.pivot(rows, columns, functions) subtotals aside tablite will give you everything Excels pivot table can do. Time series To convert time series use a list comprehension.t1['GMT'] = [timedelta(hours=1) + v for v in t1['date'] ] to generate a date range use:from Tablite import daterange t['date'] = date_range(start=2022/1/1, stop=2023/1/1, step=timedelta(days=1)) Categorical Pandas only seems to use this for sorting and grouping. Tablite table has .sort , .groupby and .pivot to achieve the same task. Plotting Import your favorite plotting package and feed it the values, such as:import matplotlib.pyplot as plt plt.plot(t['a'],t['b']) plt.showw() Import/Export Tablite supports the same import/export options as pandas.Tablite pegs the free memory before IO and can therefore process larger-than-RAM files. Tablite also guesses the datatypes for all ISOformats and uses multiprocessing and may therefore be faster. Should you want to inspect how guess works, use from tools import guess and try the function out. Gotchas None really. 
Should you come across something non-pythonic, then please post it on the issue list."},{"location":"reference/base/","title":"Base","text":""},{"location":"reference/base/#tablite.base","title":"tablite.base ","text":""},{"location":"reference/base/#tablite.base-attributes","title":"Attributes","text":""},{"location":"reference/base/#tablite.base.log","title":"tablite.base.log = logging.getLogger(__name__) module-attribute ","text":""},{"location":"reference/base/#tablite.base.file_registry","title":"tablite.base.file_registry = set() module-attribute ","text":""},{"location":"reference/base/#tablite.base-classes","title":"Classes","text":""},{"location":"reference/base/#tablite.base.SimplePage","title":"tablite.base.SimplePage(id, path, len, py_dtype) ","text":" Bases: object Source code in tablite/base.py def __init__(self, id, path, len, py_dtype) -> None:\n self.path = Path(path) / \"pages\" / f\"{id}.npy\"\n self.len = len\n self.dtype = py_dtype\n\n self._incr_refcount()\n "},{"location":"reference/base/#tablite.base.SimplePage-attributes","title":"Attributes","text":""},{"location":"reference/base/#tablite.base.SimplePage.ids","title":"tablite.base.SimplePage.ids = count(start=1) class-attribute instance-attribute ","text":""},{"location":"reference/base/#tablite.base.SimplePage.refcounts","title":"tablite.base.SimplePage.refcounts = {} class-attribute instance-attribute ","text":""},{"location":"reference/base/#tablite.base.SimplePage.autocleanup","title":"tablite.base.SimplePage.autocleanup = True class-attribute instance-attribute ","text":""},{"location":"reference/base/#tablite.base.SimplePage.path","title":"tablite.base.SimplePage.path = Path(path) / 'pages' / f'{id}.npy' instance-attribute ","text":""},{"location":"reference/base/#tablite.base.SimplePage.len","title":"tablite.base.SimplePage.len = len instance-attribute ","text":""},{"location":"reference/base/#tablite.base.SimplePage.dtype","title":"tablite.base.SimplePage.dtype = py_dtype instance-attribute ","text":""},{"location":"reference/base/#tablite.base.SimplePage-functions","title":"Functions","text":""},{"location":"reference/base/#tablite.base.SimplePage.__setstate__","title":"tablite.base.SimplePage.__setstate__(state) ","text":"when an object is unpickled, say in a case of multi-processing, object.setstate(state) is called instead of init, this means we need to update page refcount as if constructor had been called Source code in tablite/base.py def __setstate__(self, state):\n \"\"\"\n when an object is unpickled, say in a case of multi-processing,\n object.__setstate__(state) is called instead of __init__, this means\n we need to update page refcount as if constructor had been called\n \"\"\"\n self.__dict__.update(state)\n\n self._incr_refcount()\n "},{"location":"reference/base/#tablite.base.SimplePage.next_id","title":"tablite.base.SimplePage.next_id(path) classmethod ","text":"Source code in tablite/base.py @classmethod\ndef next_id(cls, path):\n path = Path(path)\n\n while True:\n _id = f\"{os.getpid()}-{next(cls.ids)}\"\n _path = path / \"pages\" / f\"{_id}.npy\"\n\n if not _path.exists():\n break # make sure we don't override existing pages if they are created outside of main thread\n\n return _id\n "},{"location":"reference/base/#tablite.base.SimplePage.__len__","title":"tablite.base.SimplePage.__len__() ","text":"Source code in tablite/base.py def __len__(self):\n return self.len\n "},{"location":"reference/base/#tablite.base.SimplePage.__repr__","title":"tablite.base.SimplePage.__repr__() -> str 
","text":"Source code in tablite/base.py def __repr__(self) -> str:\n try:\n return f\"{self.__class__.__name__}({self.path}, {self.get()})\"\n except FileNotFoundError as e:\n return f\"{self.__class__.__name__}({self.path}, <{type(e).__name__}>)\"\n except Exception as e:\n return f\"{self.__class__.__name__}({self.path}, <{e}>)\"\n "},{"location":"reference/base/#tablite.base.SimplePage.__hash__","title":"tablite.base.SimplePage.__hash__() -> int ","text":"Source code in tablite/base.py def __hash__(self) -> int:\n return hash(self.path)\n "},{"location":"reference/base/#tablite.base.SimplePage.owns","title":"tablite.base.SimplePage.owns() ","text":"Source code in tablite/base.py def owns(self):\n parts = self.path.parts\n\n return all((p in parts for p in Path(Config.pid).parts))\n "},{"location":"reference/base/#tablite.base.SimplePage.__del__","title":"tablite.base.SimplePage.__del__() ","text":"When python's reference count for an object is 0, python uses it's garbage collector to remove the object and free the memory. As tablite tables have columns and columns have page and pages have data stored on disk, the space on disk must be freed up as well. This del override assures the cleanup of stored data. Source code in tablite/base.py def __del__(self):\n \"\"\"When python's reference count for an object is 0, python uses\n it's garbage collector to remove the object and free the memory.\n As tablite tables have columns and columns have page and pages have\n data stored on disk, the space on disk must be freed up as well.\n This __del__ override assures the cleanup of stored data.\n \"\"\"\n if not self.owns():\n return\n\n refcount = self.refcounts[self.path] = max(\n self.refcounts.get(self.path, 0) - 1, 0\n )\n\n if refcount > 0:\n return\n\n if self.autocleanup:\n self.path.unlink(True)\n\n del self.refcounts[self.path]\n "},{"location":"reference/base/#tablite.base.SimplePage.get","title":"tablite.base.SimplePage.get() ","text":"loads stored data RETURNS DESCRIPTION np.ndarray: stored data. Source code in tablite/base.py def get(self):\n \"\"\"loads stored data\n\n Returns:\n np.ndarray: stored data.\n \"\"\"\n array = load_numpy(self.path)\n return MetaArray(array, array.dtype, py_dtype=self.dtype)\n "},{"location":"reference/base/#tablite.base.Page","title":"tablite.base.Page(path, array) ","text":" Bases: SimplePage PARAMETER DESCRIPTION path working directory. 
TYPE: Path array data TYPE: array Source code in tablite/base.py def __init__(self, path, array) -> None:\n \"\"\"\n Args:\n path (Path): working directory.\n array (np.array): data\n \"\"\"\n _id = self.next_id(path)\n\n type_check(array, np.ndarray)\n\n if Config.DISK_LIMIT <= 0:\n pass\n else:\n _, _, free = shutil.disk_usage(path)\n if free - array.nbytes < Config.DISK_LIMIT:\n msg = \"\\n\".join(\n [\n f\"Disk limit reached: Config.DISK_LIMIT = {Config.DISK_LIMIT:,} bytes.\",\n f\"array requires {array.nbytes:,} bytes, but only {free:,} bytes are free.\",\n \"To disable this check, use:\",\n \">>> from tablite.config import Config\",\n \">>> Config.DISK_LIMIT = 0\",\n \"To free space, clean up Config.workdir:\",\n f\"{Config.workdir}\",\n ]\n )\n raise OSError(msg)\n\n _len = len(array)\n # type_check(array, MetaArray)\n if not hasattr(array, \"metadata\"):\n raise ValueError\n _dtype = array.metadata[\"py_dtype\"]\n\n super().__init__(_id, path, _len, _dtype)\n\n np.save(self.path, array, allow_pickle=True, fix_imports=False)\n log.debug(f\"Page saved: {self.path}\")\n "},{"location":"reference/base/#tablite.base.Page-attributes","title":"Attributes","text":""},{"location":"reference/base/#tablite.base.Page.ids","title":"tablite.base.Page.ids = count(start=1) class-attribute instance-attribute ","text":""},{"location":"reference/base/#tablite.base.Page.refcounts","title":"tablite.base.Page.refcounts = {} class-attribute instance-attribute ","text":""},{"location":"reference/base/#tablite.base.Page.autocleanup","title":"tablite.base.Page.autocleanup = True class-attribute instance-attribute ","text":""},{"location":"reference/base/#tablite.base.Page.path","title":"tablite.base.Page.path = Path(path) / 'pages' / f'{id}.npy' instance-attribute ","text":""},{"location":"reference/base/#tablite.base.Page.len","title":"tablite.base.Page.len = len instance-attribute ","text":""},{"location":"reference/base/#tablite.base.Page.dtype","title":"tablite.base.Page.dtype = py_dtype instance-attribute ","text":""},{"location":"reference/base/#tablite.base.Page-functions","title":"Functions","text":""},{"location":"reference/base/#tablite.base.Page.__setstate__","title":"tablite.base.Page.__setstate__(state) ","text":"when an object is unpickled, say in a case of multi-processing, object.setstate(state) is called instead of init, this means we need to update page refcount as if constructor had been called Source code in tablite/base.py def __setstate__(self, state):\n \"\"\"\n when an object is unpickled, say in a case of multi-processing,\n object.__setstate__(state) is called instead of __init__, this means\n we need to update page refcount as if constructor had been called\n \"\"\"\n self.__dict__.update(state)\n\n self._incr_refcount()\n "},{"location":"reference/base/#tablite.base.Page.next_id","title":"tablite.base.Page.next_id(path) classmethod ","text":"Source code in tablite/base.py @classmethod\ndef next_id(cls, path):\n path = Path(path)\n\n while True:\n _id = f\"{os.getpid()}-{next(cls.ids)}\"\n _path = path / \"pages\" / f\"{_id}.npy\"\n\n if not _path.exists():\n break # make sure we don't override existing pages if they are created outside of main thread\n\n return _id\n "},{"location":"reference/base/#tablite.base.Page.__len__","title":"tablite.base.Page.__len__() ","text":"Source code in tablite/base.py def __len__(self):\n return self.len\n "},{"location":"reference/base/#tablite.base.Page.__repr__","title":"tablite.base.Page.__repr__() -> str ","text":"Source code in 
tablite/base.py def __repr__(self) -> str:\n try:\n return f\"{self.__class__.__name__}({self.path}, {self.get()})\"\n except FileNotFoundError as e:\n return f\"{self.__class__.__name__}({self.path}, <{type(e).__name__}>)\"\n except Exception as e:\n return f\"{self.__class__.__name__}({self.path}, <{e}>)\"\n "},{"location":"reference/base/#tablite.base.Page.__hash__","title":"tablite.base.Page.__hash__() -> int ","text":"Source code in tablite/base.py def __hash__(self) -> int:\n return hash(self.path)\n "},{"location":"reference/base/#tablite.base.Page.owns","title":"tablite.base.Page.owns() ","text":"Source code in tablite/base.py def owns(self):\n parts = self.path.parts\n\n return all((p in parts for p in Path(Config.pid).parts))\n "},{"location":"reference/base/#tablite.base.Page.__del__","title":"tablite.base.Page.__del__() ","text":"When python's reference count for an object is 0, python uses it's garbage collector to remove the object and free the memory. As tablite tables have columns and columns have page and pages have data stored on disk, the space on disk must be freed up as well. This del override assures the cleanup of stored data. Source code in tablite/base.py def __del__(self):\n \"\"\"When python's reference count for an object is 0, python uses\n it's garbage collector to remove the object and free the memory.\n As tablite tables have columns and columns have page and pages have\n data stored on disk, the space on disk must be freed up as well.\n This __del__ override assures the cleanup of stored data.\n \"\"\"\n if not self.owns():\n return\n\n refcount = self.refcounts[self.path] = max(\n self.refcounts.get(self.path, 0) - 1, 0\n )\n\n if refcount > 0:\n return\n\n if self.autocleanup:\n self.path.unlink(True)\n\n del self.refcounts[self.path]\n "},{"location":"reference/base/#tablite.base.Page.get","title":"tablite.base.Page.get() ","text":"loads stored data RETURNS DESCRIPTION np.ndarray: stored data. Source code in tablite/base.py def get(self):\n \"\"\"loads stored data\n\n Returns:\n np.ndarray: stored data.\n \"\"\"\n array = load_numpy(self.path)\n return MetaArray(array, array.dtype, py_dtype=self.dtype)\n "},{"location":"reference/base/#tablite.base.Column","title":"tablite.base.Column(path, value=None) ","text":" Bases: object Create Column PARAMETER DESCRIPTION path path of table.yml (defaults: Config.pid_dir) TYPE: Path value Data to store. Defaults to None. TYPE: Iterable DEFAULT: None Source code in tablite/base.py def __init__(self, path, value=None) -> None:\n \"\"\"Create Column\n\n Args:\n path (Path): path of table.yml (defaults: Config.pid_dir)\n value (Iterable, optional): Data to store. 
Defaults to None.\n \"\"\"\n self.path = path\n self.pages = [] # keeps pointers to instances of Page\n if value is not None:\n self.extend(value)\n "},{"location":"reference/base/#tablite.base.Column-attributes","title":"Attributes","text":""},{"location":"reference/base/#tablite.base.Column.path","title":"tablite.base.Column.path = path instance-attribute ","text":""},{"location":"reference/base/#tablite.base.Column.pages","title":"tablite.base.Column.pages = [] instance-attribute ","text":""},{"location":"reference/base/#tablite.base.Column-functions","title":"Functions","text":""},{"location":"reference/base/#tablite.base.Column.__len__","title":"tablite.base.Column.__len__() ","text":"Source code in tablite/base.py def __len__(self):\n return sum(len(p) for p in self.pages)\n "},{"location":"reference/base/#tablite.base.Column.__repr__","title":"tablite.base.Column.__repr__() ","text":"Source code in tablite/base.py def __repr__(self):\n return f\"{self.__class__.__name__}({self.path}, {self[:]})\"\n "},{"location":"reference/base/#tablite.base.Column.repaginate","title":"tablite.base.Column.repaginate() ","text":"resizes pages to Config.PAGE_SIZE Source code in tablite/base.py def repaginate(self):\n \"\"\"resizes pages to Config.PAGE_SIZE\"\"\"\n from tablite.nimlite import repaginate as _repaginate\n\n _repaginate(self)\n "},{"location":"reference/base/#tablite.base.Column.extend","title":"tablite.base.Column.extend(value) ","text":"extends the column. PARAMETER DESCRIPTION value data TYPE: ndarray Source code in tablite/base.py def extend(self, value): # USER FUNCTION.\n \"\"\"extends the column.\n\n Args:\n value (np.ndarray): data\n \"\"\"\n if isinstance(value, Column):\n self.pages.extend(value.pages[:])\n return\n elif isinstance(value, np.ndarray):\n pass\n elif isinstance(value, (list, tuple)):\n value = list_to_np_array(value)\n else:\n raise TypeError(f\"Cannot extend Column with {type(value)}\")\n type_check(value, np.ndarray)\n for array in self._paginate(value):\n self.pages.append(Page(path=self.path, array=array))\n "},{"location":"reference/base/#tablite.base.Column.clear","title":"tablite.base.Column.clear() ","text":"clears the column. Like list().clear() Source code in tablite/base.py def clear(self):\n \"\"\"\n clears the column. Like list().clear()\n \"\"\"\n self.pages.clear()\n "},{"location":"reference/base/#tablite.base.Column.getpages","title":"tablite.base.Column.getpages(item) ","text":"public non-user function to identify any pages + slices of data to be retrieved given a slice (item) PARAMETER DESCRIPTION item target slice of data TYPE: (int, slice) RETURNS DESCRIPTION list of pages/np.ndarrays. Example: [Page(1), Page(2), np.ndarray([4,5,6], int64)] This helps, for example when creating a copy, as the copy can reference the pages 1 and 2 and only need to store the np.ndarray that is unique to it. 
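A minimal sketch of the Column workflow described in the constructor and `extend` entries above, assuming the public `from tablite import Table` entry point (the table supplies the working directory beneath which pages are stored; `t["a"]` hands back the live Column, so extending it mutates the table):

```python
import numpy as np
from tablite import Table

t = Table(columns={"a": [1, 2, 3]})
col = t["a"]                      # the live tablite.base.Column

col.extend([4, 5])                # lists are converted to numpy arrays first
col.extend(np.array([6, 7]))      # arrays are paginated into Pages directly
assert len(col) == 7              # __len__ sums the lengths of all pages
```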
Source code in tablite/base.py def getpages(self, item):\n \"\"\"public non-user function to identify any pages + slices\n of data to be retrieved given a slice (item)\n\n Args:\n item (int,slice): target slice of data\n\n Returns:\n list of pages/np.ndarrays.\n\n Example: [Page(1), Page(2), np.ndarray([4,5,6], int64)]\n This helps, for example when creating a copy, as the copy\n can reference the pages 1 and 2 and only need to store\n the np.ndarray that is unique to it.\n \"\"\"\n # internal function\n if isinstance(item, int):\n if item < 0:\n item = len(self) + item\n item = slice(item, item + 1, 1)\n\n type_check(item, slice)\n is_reversed = False if (item.step is None or item.step > 0) else True\n\n length = len(self)\n scan_item = slice(*item.indices(length))\n range_item = range(*item.indices(length))\n\n pages = []\n start, end = 0, 0\n for page in self.pages:\n start, end = end, end + page.len\n if is_reversed:\n if start > scan_item.start:\n break\n if end < scan_item.stop:\n continue\n else:\n if start > scan_item.stop:\n break\n if end < scan_item.start:\n continue\n ro = intercept(range(start, end), range_item)\n if len(ro) == 0:\n continue\n elif len(ro) == page.len: # share the whole immutable page\n pages.append(page)\n else: # fetch the slice and filter it.\n search_slice = slice(ro.start - start, ro.stop - start, ro.step)\n np_arr = load_numpy(page.path)\n match = np_arr[search_slice]\n pages.append(match)\n\n if is_reversed:\n pages.reverse()\n for ix, page in enumerate(pages):\n if isinstance(page, SimplePage):\n data = page.get()\n pages[ix] = np.flip(data)\n else:\n pages[ix] = np.flip(page)\n\n return pages\n "},{"location":"reference/base/#tablite.base.Column.iter_by_page","title":"tablite.base.Column.iter_by_page() ","text":"iterates over the column, page by page. This method minimizes the number of reads. RETURNS DESCRIPTION generator of tuple: start: int end: int data: np.ndarray Source code in tablite/base.py def iter_by_page(self):\n \"\"\"iterates over the column, page by page.\n This method minimizes the number of reads.\n\n Returns:\n generator of tuple:\n start: int\n end: int\n data: np.ndarray\n \"\"\"\n start, end = 0, 0\n for page in self.pages:\n start, end = end, end + page.len\n yield start, end, page\n "},{"location":"reference/base/#tablite.base.Column.__getitem__","title":"tablite.base.Column.__getitem__(item) ","text":"gets numpy array. PARAMETER DESCRIPTION item slice of column TYPE: int OR slice RETURNS DESCRIPTION np.ndarray: results as numpy array. Remember: >>> R = np.array([0,1,2,3,4,5])\n>>> R[3]\n3\n>>> R[3:4]\narray([3])\n Source code in tablite/base.py def __getitem__(self, item): # USER FUNCTION.\n \"\"\"gets numpy array.\n\n Args:\n item (int OR slice): slice of column\n\n Returns:\n np.ndarray: results as numpy array.\n\n Remember:\n ```\n >>> R = np.array([0,1,2,3,4,5])\n >>> R[3]\n 3\n >>> R[3:4]\n array([3])\n ```\n \"\"\"\n result = []\n for element in self.getpages(item):\n if isinstance(element, SimplePage):\n result.append(element.get())\n else:\n result.append(element)\n\n if result:\n arr = np_type_unify(result)\n else:\n arr = np.array([])\n\n if isinstance(item, int):\n if len(arr) == 0:\n raise IndexError(\n f\"index {item} is out of bounds for axis 0 with size {len(self)}\"\n )\n return numpy_to_python(arr[0])\n else:\n return arr\n "},{"location":"reference/base/#tablite.base.Column.__setitem__","title":"tablite.base.Column.__setitem__(key, value) ","text":"sets values. 
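A short sketch of the read paths documented above, assuming a plain integer column; it relies only on the `__getitem__` semantics and the `(start, end, page)` tuples that `iter_by_page` yields:

```python
from tablite import Table

t = Table(columns={"a": list(range(10))})
col = t["a"]

col[3]        # -> 3, a single python value (like numpy scalar indexing)
col[3:5]      # -> np.ndarray([3, 4])

# iter_by_page streams one stored page at a time, minimizing reads,
# so aggregates can run without materializing the whole column:
total = 0
for start, end, page in col.iter_by_page():
    total += page.get().sum()     # page.get() loads the rows [start:end)
assert total == sum(range(10))
```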
PARAMETER DESCRIPTION key selector TYPE: (int, slice) value values to insert TYPE: any RAISES DESCRIPTION KeyError Following normal slicing rules Source code in tablite/base.py def __setitem__(self, key, value): # USER FUNCTION.\n \"\"\"sets values.\n\n Args:\n key (int,slice): selector\n value (any): values to insert\n\n Raises:\n KeyError: Following normal slicing rules\n \"\"\"\n if isinstance(key, int):\n self._setitem_integer_key(key, value)\n\n elif isinstance(key, slice):\n if not isinstance(value, np.ndarray):\n value = list_to_np_array(value)\n type_check(value, np.ndarray)\n\n if key.start is None and key.stop is None and key.step in (None, 1):\n self._setitem_replace_all(key, value)\n elif key.start is not None and key.stop is None and key.step in (None, 1):\n self._setitem_extend(key, value)\n elif key.stop is not None and key.start is None and key.step in (None, 1):\n self._setitem_prextend(key, value)\n elif (\n key.step in (None, 1) and key.start is not None and key.stop is not None\n ):\n self._setitem_insert(key, value)\n elif key.step not in (None, 1):\n self._setitem_update(key, value)\n else:\n raise KeyError(f\"bad key: {key}\")\n else:\n raise KeyError(f\"bad key: {key}\")\n "},{"location":"reference/base/#tablite.base.Column.__delitem__","title":"tablite.base.Column.__delitem__(key) ","text":"deletes items selected by key PARAMETER DESCRIPTION key selector TYPE: (int, slice) RAISES DESCRIPTION KeyError following normal slicing rules. Source code in tablite/base.py def __delitem__(self, key): # USER FUNCTION\n \"\"\"deletes items selected by key\n\n Args:\n key (int,slice): selector\n\n Raises:\n KeyError: following normal slicing rules.\n \"\"\"\n if isinstance(key, int):\n self._del_by_int(key)\n elif isinstance(key, slice):\n self._del_by_slice(key)\n else:\n raise KeyError(f\"bad key: {key}\")\n "},{"location":"reference/base/#tablite.base.Column.get_by_indices","title":"tablite.base.Column.get_by_indices(indices: Union[List[int], np.ndarray]) -> np.ndarray ","text":"retrieves values from column given a set of indices. PARAMETER DESCRIPTION indices targets TYPE: array This method uses np.take, is faster than iterating over rows. 
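A mutation sketch for the `__setitem__` and `__delitem__` entries above, assuming the documented "normal slicing rules"; the exact values in the final assert follow from list-like semantics:

```python
from tablite import Table

t = Table(columns={"a": [1, 2, 3, 4]})
col = t["a"]

col[0] = 42               # integer key: single-value update
col[1:3] = [10, 11]       # start+stop slice: replaces that span
col[len(col):] = [99]     # open-ended slice: extends the column
del col[-1]               # deletion follows normal slicing rules too
assert col == [42, 10, 11, 4]
```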
Examples: >>> indices = np.array(list(range(3,700_700, 426)))\n>>> arr = np.array(list(range(2_000_000)))\nPythonic:\n>>> [v for i,v in enumerate(arr) if i in indices]\nNumpyionic:\n>>> np.take(arr, indices)\n Source code in tablite/base.py def get_by_indices(self, indices: Union[List[int], np.ndarray]) -> np.ndarray:\n \"\"\"retrieves values from column given a set of indices.\n\n Args:\n indices (np.array): targets\n\n This method uses np.take, is faster than iterating over rows.\n Examples:\n ```\n >>> indices = np.array(list(range(3,700_700, 426)))\n >>> arr = np.array(list(range(2_000_000)))\n Pythonic:\n >>> [v for i,v in enumerate(arr) if i in indices]\n Numpyionic:\n >>> np.take(arr, indices)\n ```\n \"\"\"\n type_check(indices, np.ndarray)\n\n dtypes = set()\n values = np.empty(\n indices.shape, dtype=object\n ) # placeholder for the indexed values.\n\n for start, end, page in self.iter_by_page():\n range_match = np.asarray(((indices >= start) & (indices < end)) | (indices == -1)).nonzero()[0]\n if len(range_match):\n # only fetch the data if there's a range match!\n data = page.get() \n sub_index = np.take(indices, range_match)\n # sub_index2 otherwise will raise index error where len(data) > (-1 - start)\n # so the clause below is required:\n if len(data) > (-1 - start):\n sub_index = np.where(sub_index == -1, -1, sub_index - start)\n arr = np.take(data, sub_index)\n dtypes.add(arr.dtype)\n np.put(values, range_match, arr)\n\n if len(dtypes) == 1: # simplify the datatype\n dtype = next(iter(dtypes))\n values = np.array(values, dtype=dtype)\n return values\n "},{"location":"reference/base/#tablite.base.Column.__iter__","title":"tablite.base.Column.__iter__() ","text":"Source code in tablite/base.py def __iter__(self): # USER FUNCTION.\n for page in self.pages:\n data = page.get()\n for value in data:\n yield value\n "},{"location":"reference/base/#tablite.base.Column.__eq__","title":"tablite.base.Column.__eq__(other) ","text":"compares two columns. Like list1 == list2 Source code in tablite/base.py def __eq__(self, other): # USER FUNCTION.\n \"\"\"\n compares two columns. Like `list1 == list2`\n \"\"\"\n if len(self) != len(other): # quick cheap check.\n return False\n\n if isinstance(other, (list, tuple)):\n return all(a == b for a, b in zip(self[:], other))\n\n elif isinstance(other, Column):\n if self.pages == other.pages: # special case.\n return True\n\n # are the pages of same size?\n if len(self.pages) == len(other.pages):\n if [p.len for p in self.pages] == [p.len for p in other.pages]:\n for a, b in zip(self.pages, other.pages):\n if not (a.get() == b.get()).all():\n return False\n return True\n # to bad. Element comparison it is then:\n for a, b in zip(iter(self), iter(other)):\n if a != b:\n return False\n return True\n\n elif isinstance(other, np.ndarray):\n start, end = 0, 0\n for p in self.pages:\n start, end = end, end + p.len\n if not (p.get() == other[start:end]).all():\n return False\n return True\n else:\n raise TypeError(f\"Cannot compare {self.__class__} with {type(other)}\")\n "},{"location":"reference/base/#tablite.base.Column.__ne__","title":"tablite.base.Column.__ne__(other) ","text":"compares two columns. Like list1 != list2 Source code in tablite/base.py def __ne__(self, other): # USER FUNCTION\n \"\"\"\n compares two columns. 
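A comparison sketch built on the `__eq__` behavior documented above: cheap checks (length, shared pages) run before any element-wise work, and lists, Columns and numpy arrays are all accepted:

```python
from tablite import Table

t = Table(columns={"a": [1, 2, 3]})
a = t["a"]
b = a.copy()              # copy re-references the same immutable pages

assert a == b             # resolved by the shared-pages fast path
assert a == [1, 2, 3]     # lists compare element-wise
assert a == a[:]          # numpy arrays compare page by page
```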
Like `list1 != list2`\n \"\"\"\n if len(self) != len(other): # quick cheap check.\n return True\n\n if isinstance(other, (list, tuple)):\n return any(a != b for a, b in zip(self[:], other))\n\n elif isinstance(other, Column):\n if self.pages == other.pages: # special case.\n return False\n\n # are the pages of same size?\n if len(self.pages) == len(other.pages):\n if [p.len for p in self.pages] == [p.len for p in other.pages]:\n for a, b in zip(self.pages, other.pages):\n if not (a.get() == b.get()).all():\n return True\n return False\n # to bad. Element comparison it is then:\n for a, b in zip(iter(self), iter(other)):\n if a != b:\n return True\n return False\n\n elif isinstance(other, np.ndarray):\n start, end = 0, 0\n for p in self.pages:\n start, end = end, end + p.len\n if (p.get() != other[start:end]).any():\n return True\n return False\n else:\n raise TypeError(f\"Cannot compare {self.__class__} with {type(other)}\")\n "},{"location":"reference/base/#tablite.base.Column.copy","title":"tablite.base.Column.copy() ","text":"returns deep=copy of Column RETURNS DESCRIPTION Column Source code in tablite/base.py def copy(self):\n \"\"\"returns deep=copy of Column\n\n Returns:\n Column\n \"\"\"\n cp = Column(path=self.path)\n cp.pages = self.pages[:]\n return cp\n "},{"location":"reference/base/#tablite.base.Column.__copy__","title":"tablite.base.Column.__copy__() ","text":"see copy Source code in tablite/base.py def __copy__(self):\n \"\"\"see copy\"\"\"\n return self.copy()\n "},{"location":"reference/base/#tablite.base.Column.__imul__","title":"tablite.base.Column.__imul__(other) ","text":"Repeats instance of column N times. Like list() * N Example: >>> one = Column(data=[1,2])\n>>> one *= 5\n>>> one\n[1,2, 1,2, 1,2, 1,2, 1,2]\n Source code in tablite/base.py def __imul__(self, other):\n \"\"\"\n Repeats instance of column N times. Like list() * N\n\n Example:\n ```\n >>> one = Column(data=[1,2])\n >>> one *= 5\n >>> one\n [1,2, 1,2, 1,2, 1,2, 1,2]\n ```\n \"\"\"\n if not (isinstance(other, int) and other > 0):\n raise TypeError(\n f\"a column can be repeated an integer number of times, not {type(other)} number of times\"\n )\n self.pages = self.pages[:] * other\n return self\n "},{"location":"reference/base/#tablite.base.Column.__mul__","title":"tablite.base.Column.__mul__(other) ","text":"Repeats instance of column N times. Like list() * N Example: >>> one = Column(data=[1,2])\n>>> two = one * 5\n>>> two\n[1,2, 1,2, 1,2, 1,2, 1,2]\n Source code in tablite/base.py def __mul__(self, other):\n \"\"\"\n Repeats instance of column N times. Like list() * N\n\n Example:\n ```\n >>> one = Column(data=[1,2])\n >>> two = one * 5\n >>> two\n [1,2, 1,2, 1,2, 1,2, 1,2]\n ```\n \"\"\"\n if not isinstance(other, int):\n raise TypeError(\n f\"a column can be repeated an integer number of times, not {type(other)} number of times\"\n )\n cp = self.copy()\n cp *= other\n return cp\n "},{"location":"reference/base/#tablite.base.Column.__iadd__","title":"tablite.base.Column.__iadd__(other) ","text":"Source code in tablite/base.py def __iadd__(self, other):\n if isinstance(other, (list, tuple)):\n other = list_to_np_array(other)\n self.extend(other)\n elif isinstance(other, Column):\n self.pages.extend(other.pages[:])\n else:\n raise TypeError(f\"{type(other)} not supported.\")\n return self\n "},{"location":"reference/base/#tablite.base.Column.__contains__","title":"tablite.base.Column.__contains__(item) ","text":"determines if item is in the Column. 
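Because pages are immutable and reference-counted, `copy`, `*=` and `+=` only manipulate the list of page pointers; no stored data is rewritten. A sketch under that assumption:

```python
from tablite import Table

t = Table(columns={"a": [1, 2]})
col = t["a"]

cp = col.copy()           # cheap deep-copy: only page pointers are copied
cp *= 3                   # repetition repeats the page list
cp += col                 # concatenation appends the other column's pages
assert len(cp) == 4 * len(col) == 8
```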
Similar to 'x' in ['a','b','c'] returns boolean PARAMETER DESCRIPTION item value to search for TYPE: any RETURNS DESCRIPTION bool True if item exists in column. Source code in tablite/base.py def __contains__(self, item):\n \"\"\"determines if item is in the Column.\n Similar to `'x' in ['a','b','c']`\n returns boolean\n\n Args:\n item (any): value to search for\n\n Returns:\n bool: True if item exists in column.\n \"\"\"\n for page in set(self.pages):\n if item in page.get(): # x in np.ndarray([...]) uses np.any(arr, value)\n return True\n return False\n "},{"location":"reference/base/#tablite.base.Column.remove_all","title":"tablite.base.Column.remove_all(*values) ","text":"removes all values of values Source code in tablite/base.py def remove_all(self, *values):\n \"\"\"\n removes all values of `values`\n \"\"\"\n type_check(values, tuple)\n if isinstance(values[0], tuple):\n values = values[0]\n to_remove = list_to_np_array(values)\n for index, page in enumerate(self.pages):\n data = page.get()\n bitmask = np.isin(data, to_remove) # identify elements to remove.\n if bitmask.any():\n bitmask = np.invert(bitmask) # turn bitmask around to keep.\n new_data = np.compress(bitmask, data)\n new_page = Page(self.path, new_data)\n self.pages[index] = new_page\n "},{"location":"reference/base/#tablite.base.Column.replace","title":"tablite.base.Column.replace(mapping) ","text":"replaces values using a mapping. PARAMETER DESCRIPTION mapping {value to replace: new value, ...} TYPE: dict Example: >>> t = Table(columns={'A': [1,2,3,4]})\n>>> t['A'].replace({2:20,4:40})\n>>> t[:]\nnp.ndarray([1,20,3,40])\n Source code in tablite/base.py def replace(self, mapping):\n \"\"\"\n replaces values using a mapping.\n\n Args:\n mapping (dict): {value to replace: new value, ...}\n\n Example:\n ```\n >>> t = Table(columns={'A': [1,2,3,4]})\n >>> t['A'].replace({2:20,4:40})\n >>> t[:]\n np.ndarray([1,20,3,40])\n ```\n \"\"\"\n type_check(mapping, dict)\n to_replace = np.array(list(mapping.keys()))\n for index, page in enumerate(self.pages):\n data = page.get()\n bitmask = np.isin(data, to_replace) # identify elements to replace.\n if bitmask.any():\n warray = np.compress(bitmask, data)\n py_dtype = page.dtype\n for ix, v in enumerate(warray):\n old_py_val = numpy_to_python(v)\n new_py_val = mapping[old_py_val]\n old_dt = type(old_py_val)\n new_dt = type(new_py_val)\n\n warray[ix] = new_py_val\n\n py_dtype[new_dt] = py_dtype.get(new_dt, 0) + 1\n py_dtype[old_dt] = py_dtype.get(old_dt, 0) - 1\n\n if py_dtype[old_dt] <= 0:\n del py_dtype[old_dt]\n\n data[bitmask] = warray\n self.pages[index] = Page(path=self.path, array=data)\n "},{"location":"reference/base/#tablite.base.Column.types","title":"tablite.base.Column.types() ","text":"returns dict with python datatypes RETURNS DESCRIPTION dict frequency of occurrence of python datatypes Source code in tablite/base.py def types(self):\n \"\"\"\n returns dict with python datatypes\n\n Returns:\n dict: frequency of occurrence of python datatypes\n \"\"\"\n d = Counter()\n for page in self.pages:\n assert isinstance(page.dtype, dict)\n d += page.dtype\n return dict(d)\n "},{"location":"reference/base/#tablite.base.Column.index","title":"tablite.base.Column.index() ","text":"returns dict with { unique entry : list of indices } example: >>> c = Column(data=['a','b','a','c','b'])\n>>> c.index()\n{'a':[0,2], 'b': [1,4], 'c': [3]}\n Source code in tablite/base.py def index(self):\n \"\"\"\n returns dict with { unique entry : list of indices }\n\n example:\n ```\n >>> c = 
Column(data=['a','b','a','c','b'])\n >>> c.index()\n {'a':[0,2], 'b': [1,4], 'c': [3]}\n ```\n \"\"\"\n d = defaultdict(list)\n for ix, v in enumerate(self.__iter__()):\n d[v].append(ix)\n return dict(d)\n "},{"location":"reference/base/#tablite.base.Column.unique","title":"tablite.base.Column.unique() ","text":"returns unique list of values. example: >>> c = Column(data=['a','b','a','c','b'])\n>>> c.unqiue()\n['a','b','c']\n Source code in tablite/base.py def unique(self):\n \"\"\"\n returns unique list of values.\n\n example:\n ```\n >>> c = Column(data=['a','b','a','c','b'])\n >>> c.unqiue()\n ['a','b','c']\n ```\n \"\"\"\n arrays = []\n for page in set(self.pages):\n try: # when it works, numpy is fast...\n arrays.append(np.unique(page.get()))\n except TypeError: # ...but np.unique cannot handle Nones.\n arrays.append(multitype_set(page.get()))\n union = np_type_unify(arrays)\n try:\n return np.unique(union)\n except MemoryError:\n return np.array(set(union))\n except TypeError:\n return multitype_set(union)\n "},{"location":"reference/base/#tablite.base.Column.histogram","title":"tablite.base.Column.histogram() ","text":"returns 2 arrays: unique elements and count of each element example: >>> c = Column(data=['a','b','a','c','b'])\n>>> c.histogram()\n{'a':2,'b':2,'c':1}\n Source code in tablite/base.py def histogram(self):\n \"\"\"\n returns 2 arrays: unique elements and count of each element\n\n example:\n ```\n >>> c = Column(data=['a','b','a','c','b'])\n >>> c.histogram()\n {'a':2,'b':2,'c':1}\n ```\n \"\"\"\n d = defaultdict(int)\n for page in self.pages:\n try:\n uarray, carray = np.unique(page.get(), return_counts=True)\n except TypeError:\n uarray = page.get()\n carray = repeat(1, len(uarray))\n\n for i, c in zip(uarray, carray):\n v = numpy_to_python(i)\n d[(type(v), v)] += numpy_to_python(c)\n u = [v for _, v in d.keys()]\n c = list(d.values())\n return u, c # unique, counts\n "},{"location":"reference/base/#tablite.base.Column.statistics","title":"tablite.base.Column.statistics() ","text":"provides summary statistics. RETURNS DESCRIPTION dict returns dict with: - min (int/float, length of str, date)
- max (int/float, length of str, date)
- mean (int/float, length of str, date)
- median (int/float, length of str, date)
- stdev (int/float, length of str, date)
- mode (int/float, length of str, date)
- distinct (int/float, length of str, date)
- iqr (int/float, length of str, date)
- sum (int/float, length of str, date)
- histogram (see .histogram; usage sketch below)
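A usage sketch for `statistics`, assuming a small integer column; the key names follow the list above, while the exact values come from `summary_statistics`:

```python
from tablite import Table

t = Table(columns={"a": [1, 2, 2, 3]})
stats = t["a"].statistics()
print(stats["min"], stats["max"], stats["mean"], stats["mode"])
```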
Source code in tablite/base.py def statistics(self):\n \"\"\"provides summary statistics.\n\n Returns:\n dict: returns dict with:\n - min (int/float, length of str, date)\n - max (int/float, length of str, date)\n - mean (int/float, length of str, date)\n - median (int/float, length of str, date)\n - stdev (int/float, length of str, date)\n - mode (int/float, length of str, date)\n - distinct (int/float, length of str, date)\n - iqr (int/float, length of str, date)\n - sum (int/float, length of str, date)\n - histogram (see .histogram)\n \"\"\"\n values, counts = self.histogram()\n return summary_statistics(values, counts)\n "},{"location":"reference/base/#tablite.base.Column.count","title":"tablite.base.Column.count(item) ","text":"counts appearances of item in column. Note that in python, True == 1 and False == 0 , whereby the following difference occurs: in python: >>> L = [1, True]\n>>> L.count(True)\n2\n in tablite: >>> t = Table({'L': [1,True]})\n>>> t['L'].count(True)\n1\n PARAMETER DESCRIPTION item target item TYPE: Any RETURNS DESCRIPTION int number of occurrences of item. Source code in tablite/base.py def count(self, item):\n \"\"\"counts appearances of item in column.\n\n Note that in python, `True == 1` and `False == 0`,\n whereby the following difference occurs:\n\n in python:\n ```\n >>> L = [1, True]\n >>> L.count(True)\n 2\n ```\n in tablite:\n ```\n >>> t = Table({'L': [1,True]})\n >>> t['L'].count(True)\n 1\n ```\n\n Args:\n item (Any): target item\n\n Returns:\n int: number of occurrences of item.\n \"\"\"\n result = 0\n for page in self.pages:\n data = page.get()\n if data.dtype != \"O\":\n result += np.nonzero(page.get() == item)[0].shape[0]\n # what happens here ---^ below:\n # arr = page.get()\n # >>> arr\n # array([1,2,3,4,3], int64)\n # >>> (arr == 3)\n # array([False, False, True, False, True])\n # >>> np.nonzero(arr==3)\n # (array([2,4], dtype=int64), ) <-- tuple!\n # >>> np.nonzero(page.get() == item)[0]\n # array([2,4])\n # >>> np.nonzero(page.get() == item)[0].shape\n # (2, )\n # >>> np.nonzero(page.get() == item)[0].shape[0]\n # 2\n else:\n result += sum(1 for i in data if type(i) == type(item) and i == item)\n return result\n "},{"location":"reference/base/#tablite.base.BaseTable","title":"tablite.base.BaseTable(columns: [dict, None] = None, headers: [list, None] = None, rows: [list, None] = None, _path: [Path, None] = None) ","text":" Bases: object creates Table PARAMETER DESCRIPTION EITHER columns (dict, optional): dict with column names as keys, values as lists. Example: t = Table(columns={\"a\": [1, 2], \"b\": [3, 4]}) _path path to main process working directory. 
TYPE: Path DEFAULT: None Source code in tablite/base.py def __init__(\n self,\n columns: [dict, None] = None,\n headers: [list, None] = None,\n rows: [list, None] = None,\n _path: [Path, None] = None,\n) -> None:\n \"\"\"creates Table\n\n Args:\n EITHER:\n columns (dict, optional): dict with column names as keys, values as lists.\n Example: t = Table(columns={\"a\": [1, 2], \"b\": [3, 4]})\n OR\n headers (list of strings, optional): list of column names.\n rows (list of tuples or lists, optional): values for columns\n Example: t = Table(headers=[\"a\", \"b\"], rows=[[1,3], [2,4]])\n\n _path (pathlib.Path, optional): path to main process working directory.\n \"\"\"\n if _path is None:\n if self._pid_dir is None:\n self._pid_dir = Path(Config.workdir) / Config.pid\n if not self._pid_dir.exists():\n self._pid_dir.mkdir()\n (self._pid_dir / \"pages\").mkdir()\n register(self._pid_dir)\n\n _path = Path(self._pid_dir)\n # if path exists under the given PID it will be overwritten.\n # this can only happen if the process previously was SIGKILLed.\n type_check(_path, Path)\n self.path = _path # filename used during multiprocessing.\n self.columns = {} # maps colunn names to instances of Column.\n\n # user friendly features.\n if columns and any((headers, rows)):\n raise ValueError(\"Either columns as dict OR headers and rows. Not both.\")\n\n if headers and rows:\n rotated = list(zip(*rows))\n columns = {k: v for k, v in zip(headers, rotated)}\n\n if columns:\n type_check(columns, dict)\n for k, v in columns.items():\n self.__setitem__(k, v)\n "},{"location":"reference/base/#tablite.base.BaseTable-attributes","title":"Attributes","text":""},{"location":"reference/base/#tablite.base.BaseTable.path","title":"tablite.base.BaseTable.path = _path instance-attribute ","text":""},{"location":"reference/base/#tablite.base.BaseTable.columns","title":"tablite.base.BaseTable.columns = {} instance-attribute ","text":""},{"location":"reference/base/#tablite.base.BaseTable.rows","title":"tablite.base.BaseTable.rows property ","text":"enables row based iteration in python types. Example: for row in Table.rows:\n print(row)\n Yields: tuple: values is same order as columns. 
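A sketch of row-based iteration via the `rows` property described above; each yielded tuple carries the values in column order:

```python
from tablite import Table

t = Table(columns={"a": [1, 2], "b": [3, 4]})
for row in t.rows:        # yields one tuple per row, in column order
    print(row)            # (1, 3) then (2, 4)
```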
"},{"location":"reference/base/#tablite.base.BaseTable-functions","title":"Functions","text":""},{"location":"reference/base/#tablite.base.BaseTable.__str__","title":"tablite.base.BaseTable.__str__() ","text":"Source code in tablite/base.py def __str__(self): # USER FUNCTION.\n return f\"{self.__class__.__name__}({len(self.columns):,} columns, {len(self):,} rows)\"\n "},{"location":"reference/base/#tablite.base.BaseTable.__repr__","title":"tablite.base.BaseTable.__repr__() ","text":"Source code in tablite/base.py def __repr__(self):\n return self.__str__()\n "},{"location":"reference/base/#tablite.base.BaseTable.nbytes","title":"tablite.base.BaseTable.nbytes() ","text":"finds the total bytes of the table on disk RETURNS DESCRIPTION tuple int: real bytes used on disk int: total bytes used if flattened Source code in tablite/base.py def nbytes(self): # USER FUNCTION.\n \"\"\"finds the total bytes of the table on disk\n\n Returns:\n tuple:\n int: real bytes used on disk\n int: total bytes used if flattened\n \"\"\"\n real = {}\n total = 0\n for column in self.columns.values():\n for page in set(column.pages):\n real[page] = page.path.stat().st_size\n for page in column.pages:\n total += real[page]\n return sum(real.values()), total\n "},{"location":"reference/base/#tablite.base.BaseTable.items","title":"tablite.base.BaseTable.items() ","text":"returns table as dict RETURNS DESCRIPTION dict Table as dict {column_name: [values], ...} Source code in tablite/base.py def items(self): # USER FUNCTION.\n \"\"\"returns table as dict\n\n Returns:\n dict: Table as dict `{column_name: [values], ...}`\n \"\"\"\n return {\n name: column[:].tolist() for name, column in self.columns.items()\n }.items()\n "},{"location":"reference/base/#tablite.base.BaseTable.__delitem__","title":"tablite.base.BaseTable.__delitem__(key) ","text":"Examples: >>> del table['a'] # removes column 'a'\n>>> del table[-3:] # removes last 3 rows from all columns.\n Source code in tablite/base.py def __delitem__(self, key): # USER FUNCTION.\n \"\"\"\n Examples:\n ```\n >>> del table['a'] # removes column 'a'\n >>> del table[-3:] # removes last 3 rows from all columns.\n ```\n \"\"\"\n if isinstance(key, (int, slice)):\n for column in self.columns.values():\n del column[key]\n elif key in self.columns:\n del self.columns[key]\n else:\n raise KeyError(f\"Key not found: {key}\")\n "},{"location":"reference/base/#tablite.base.BaseTable.__setitem__","title":"tablite.base.BaseTable.__setitem__(key, value) ","text":"table behaves like a dict. Args: key (str or hashable): column name value (iterable): list, tuple or nd.array with values. As Table now accepts the keyword columns as a dict: >>> t = Table(columns={'b':[4,5,6], 'c':[7,8,9]})\n and the header/data combinations: >>> t = Table(header=['b','c'], data=[[4,5,6],[7,8,9]])\n This has the side-benefit that tuples now can be used as headers. 
Source code in tablite/base.py def __setitem__(self, key, value): # USER FUNCTION\n \"\"\"table behaves like a dict.\n Args:\n key (str or hashable): column name\n value (iterable): list, tuple or nd.array with values.\n\n As Table now accepts the keyword `columns` as a dict:\n ```\n >>> t = Table(columns={'b':[4,5,6], 'c':[7,8,9]})\n ```\n and the header/data combinations:\n ```\n >>> t = Table(header=['b','c'], data=[[4,5,6],[7,8,9]])\n ```\n This has the side-benefit that tuples now can be used as headers.\n \"\"\"\n if value is None:\n self.columns[key] = Column(self.path, value=None)\n elif isinstance(value, (list, tuple)):\n value = list_to_np_array(value)\n self.columns[key] = Column(self.path, value)\n elif isinstance(value, (np.ndarray)):\n self.columns[key] = Column(self.path, value)\n elif isinstance(value, Column):\n self.columns[key] = value\n else:\n raise TypeError(f\"{type(value)} not supported.\")\n "},{"location":"reference/base/#tablite.base.BaseTable.__getitem__","title":"tablite.base.BaseTable.__getitem__(keys) ","text":"Enables selection of columns and rows PARAMETER DESCRIPTION keys TYPE: column name, integer or slice Examples >>> 10] selects first 10 rows from all columns TYPE: table[ >>> 20:3] selects column 'b' and 'c' and 'a' twice for a slice. TYPE: table['b', 'a', 'a', 'c', 2 Raises: KeyError: if key is not found. TypeError: if key is not a string, integer or slice. RETURNS DESCRIPTION Table returns columns in same order as selection. Source code in tablite/base.py def __getitem__(self, keys): # USER FUNCTION\n \"\"\"\n Enables selection of columns and rows\n\n Args:\n keys (column name, integer or slice):\n Examples:\n ```\n >>> table['a'] selects column 'a'\n >>> table[3] selects row 3 as a tuple.\n >>> table[:10] selects first 10 rows from all columns\n >>> table['a','b', slice(3,20,2)] selects a slice from columns 'a' and 'b'\n >>> table['b', 'a', 'a', 'c', 2:20:3] selects column 'b' and 'c' and 'a' twice for a slice.\n >>> table[('b', 'a', 'a', 'c')] selects columns 'b', 'a', 'a', and 'c' using a tuple.\n ```\n Raises:\n KeyError: if key is not found.\n TypeError: if key is not a string, integer or slice.\n\n Returns:\n Table: returns columns in same order as selection.\n \"\"\"\n\n if not isinstance(keys, tuple):\n if isinstance(keys, list):\n keys = tuple(keys)\n else:\n keys = (keys,)\n if isinstance(keys[0], tuple):\n keys = tuple(list(chain(*keys)))\n\n integers = [i for i in keys if isinstance(i, int)]\n if len(integers) == len(keys) == 1: # return a single tuple.\n keys = [slice(keys[0])]\n\n column_names = [i for i in keys if isinstance(i, str)]\n column_names = list(self.columns) if not column_names else column_names\n not_found = [name for name in column_names if name not in self.columns]\n if not_found:\n raise KeyError(f\"keys not found: {', '.join(not_found)}\")\n\n slices = [i for i in keys if isinstance(i, slice)]\n slc = slice(0, len(self)) if not slices else slices[0]\n\n if (\n len(slices) == 0 and len(column_names) == 1\n ): # e.g. tbl['a'] or tbl['a'][:10]\n col = self.columns[column_names[0]]\n if slices:\n return col[slc] # return slice from column as list of values\n else:\n return col # return whole column\n\n elif len(integers) == 1: # return a single tuple.\n row_no = integers[0]\n slc = slice(row_no, row_no + 1)\n return tuple(self.columns[name][slc].tolist()[0] for name in column_names)\n\n elif not slices: # e.g. 
new table with N whole columns.\n return self.__class__(\n columns={name: self.columns[name] for name in column_names}\n )\n\n else: # e.g. new table from selection of columns and slices.\n t = self.__class__()\n for name in column_names:\n column = self.columns[name]\n\n new_column = Column(t.path) # create new Column.\n for item in column.getpages(slc):\n if isinstance(item, np.ndarray):\n new_column.extend(item) # extend subslice (expensive)\n elif isinstance(item, SimplePage):\n new_column.pages.append(item) # extend page (cheap)\n else:\n raise TypeError(f\"Bad item: {item}\")\n\n # below:\n # set the new column directly on t.columns.\n # Do not use t[name] as that triggers __setitem__ again.\n t.columns[name] = new_column\n\n return t\n "},{"location":"reference/base/#tablite.base.BaseTable.__len__","title":"tablite.base.BaseTable.__len__() ","text":"Source code in tablite/base.py def __len__(self): # USER FUNCTION.\n if not self.columns:\n return 0\n return max(len(c) for c in self.columns.values())\n "},{"location":"reference/base/#tablite.base.BaseTable.__eq__","title":"tablite.base.BaseTable.__eq__(other) -> bool ","text":"Determines if two tables have identical content. PARAMETER DESCRIPTION other table for comparison TYPE: Table RETURNS DESCRIPTION bool True if tables are identical. TYPE: bool Source code in tablite/base.py def __eq__(self, other) -> bool: # USER FUNCTION.\n \"\"\"Determines if two tables have identical content.\n\n Args:\n other (Table): table for comparison\n\n Returns:\n bool: True if tables are identical.\n \"\"\"\n if isinstance(other, dict):\n return self.items() == other.items()\n if not isinstance(other, BaseTable):\n return False\n if id(self) == id(other):\n return True\n if len(self) != len(other):\n return False\n if len(self) == len(other) == 0:\n return True\n if self.columns.keys() != other.columns.keys():\n return False\n for name, col in self.columns.items():\n if not (col == other.columns[name]):\n return False\n return True\n "},{"location":"reference/base/#tablite.base.BaseTable.clear","title":"tablite.base.BaseTable.clear() ","text":"clears the table. Like dict().clear() Source code in tablite/base.py def clear(self): # USER FUNCTION.\n \"\"\"clears the table. Like dict().clear()\"\"\"\n self.columns.clear()\n "},{"location":"reference/base/#tablite.base.BaseTable.save","title":"tablite.base.BaseTable.save(path, compression_method=zipfile.ZIP_DEFLATED, compression_level=1) ","text":"saves table to compressed tpz file. PARAMETER DESCRIPTION path file destination. TYPE: Path compression_method See zipfile compression methods. Defaults to ZIP_DEFLATED. DEFAULT: ZIP_DEFLATED compression_level See zipfile compression levels. Defaults to 1. DEFAULT: 1 The file format is as follows: .tpz is a gzip archive with table metadata captured as table.yml and the necessary set of pages saved as .npy files. 
The zip contains table.yml which provides an overview of the data: --------------------------------------\n%YAML 1.2 yaml version\ncolumns: start of columns section.\n name: \u201c\u5217 1\u201d name of column 1.\n pages: [p1b1, p1b2] list of pages in column 1.\n name: \u201c\u5217 2\u201d name of column 2\n pages: [p2b1, p2b2] list of pages in column 2.\n----------------------------------------\n Source code in tablite/base.py def save(\n self, path, compression_method=zipfile.ZIP_DEFLATED, compression_level=1\n): # USER FUNCTION.\n \"\"\"saves table to compressed tpz file.\n\n Args:\n path (Path): file destination.\n compression_method: See zipfile compression methods. Defaults to ZIP_DEFLATED.\n compression_level: See zipfile compression levels. Defaults to 1.\n The default settings produce 80% compression at 10% slowdown.\n\n The file format is as follows:\n .tpz is a gzip archive with table metadata captured as table.yml\n and the necessary set of pages saved as .npy files.\n\n The zip contains table.yml which provides an overview of the data:\n ```\n --------------------------------------\n %YAML 1.2 yaml version\n columns: start of columns section.\n name: \u201c\u5217 1\u201d name of column 1.\n pages: [p1b1, p1b2] list of pages in column 1.\n name: \u201c\u5217 2\u201d name of column 2\n pages: [p2b1, p2b2] list of pages in column 2.\n ----------------------------------------\n ```\n \"\"\"\n if isinstance(path, str):\n path = Path(path)\n type_check(path, Path)\n if path.is_dir():\n raise TypeError(f\"filename needed: {path}\")\n if path.suffix != \".tpz\":\n path = path.parent / (path.parts[-1] + \".tpz\")\n\n # create yaml document\n _page_counter = 0\n d = {}\n cols = {}\n for name, col in self.columns.items():\n type_check(col, Column)\n cols[name] = {\"pages\": [p.path.name for p in col.pages]}\n _page_counter += len(col.pages)\n d[\"columns\"] = cols\n yml = yaml.safe_dump(\n d, sort_keys=False, allow_unicode=True, default_flow_style=None\n )\n\n _file_counter = 0\n with zipfile.ZipFile(\n path, \"w\", compression=compression_method, compresslevel=compression_level\n ) as f:\n log.debug(f\"writing .tpz to {path} with\\n{yml}\")\n f.writestr(\"table.yml\", yml)\n for name, col in self.columns.items():\n for page in set(\n col.pages\n ): # set of pages! remember t *= 1000 repeats t 1000x\n with open(page.path, \"rb\", buffering=0) as raw_io:\n f.writestr(page.path.name, raw_io.read())\n _file_counter += 1\n log.debug(f\"adding Page {page.path}\")\n\n _fields = len(self) * len(self.columns)\n _avg = _fields // _page_counter\n log.debug(\n f\"Wrote {_fields:,} on {_page_counter:,} pages in {_file_counter} files: {_avg} fields/page\"\n )\n "},{"location":"reference/base/#tablite.base.BaseTable.load","title":"tablite.base.BaseTable.load(path, tqdm=_tqdm) classmethod ","text":"loads a table from .tpz file. See also Table.save for details on the file format. PARAMETER DESCRIPTION path source file TYPE: Path RETURNS DESCRIPTION Table table in read-only mode. 
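A round-trip sketch for `save` and `load`; the filename is hypothetical, and the `.tpz` suffix is enforced as described above:

```python
from tablite import Table

t = Table(columns={"a": [1, 2], "b": [3, 4]})
t.save("demo.tpz")            # writes table.yml plus the .npy pages to a zip
t2 = Table.load("demo.tpz")   # re-reads metadata and pages into a new table
assert t == t2                # content equality, column by column
```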
Source code in tablite/base.py @classmethod\ndef load(cls, path, tqdm=_tqdm): # USER FUNCTION.\n \"\"\"loads a table from .tpz file.\n See also Table.save for details on the file format.\n\n Args:\n path (Path): source file\n\n Returns:\n Table: table in read-only mode.\n \"\"\"\n path = Path(path)\n log.debug(f\"loading {path}\")\n with zipfile.ZipFile(path, \"r\") as f:\n yml = f.read(\"table.yml\")\n metadata = yaml.safe_load(yml)\n t = cls()\n\n page_count = sum([len(c[\"pages\"]) for c in metadata[\"columns\"].values()])\n\n with tqdm(\n total=page_count,\n desc=f\"loading '{path.name}' file\",\n disable=Config.TQDM_DISABLE,\n ) as pbar:\n for name, d in metadata[\"columns\"].items():\n column = Column(t.path)\n for page in d[\"pages\"]:\n bytestream = io.BytesIO(f.read(page))\n data = np.load(bytestream, allow_pickle=True, fix_imports=False)\n column.extend(data)\n pbar.update(1)\n t.columns[name] = column\n update_access_time(path)\n return t\n "},{"location":"reference/base/#tablite.base.BaseTable.copy","title":"tablite.base.BaseTable.copy() ","text":"Source code in tablite/base.py def copy(self):\n cls = type(self)\n t = cls()\n for name, column in self.columns.items():\n new = Column(t.path)\n new.pages = column.pages[:]\n t.columns[name] = new\n return t\n "},{"location":"reference/base/#tablite.base.BaseTable.__imul__","title":"tablite.base.BaseTable.__imul__(other) ","text":"Repeats instance of table N times. Like list: t = t * N PARAMETER DESCRIPTION other multiplier TYPE: int Source code in tablite/base.py def __imul__(self, other):\n \"\"\"Repeats instance of table N times.\n\n Like list: `t = t * N`\n\n Args:\n other (int): multiplier\n \"\"\"\n if not (isinstance(other, int) and other > 0):\n raise TypeError(\n f\"a table can be repeated an integer number of times, not {type(other)} number of times\"\n )\n for col in self.columns.values():\n col *= other\n return self\n "},{"location":"reference/base/#tablite.base.BaseTable.__mul__","title":"tablite.base.BaseTable.__mul__(other) ","text":"Repeat table N times. Like list: new = old * N PARAMETER DESCRIPTION other multiplier TYPE: int RETURNS DESCRIPTION Table Source code in tablite/base.py def __mul__(self, other):\n \"\"\"Repeat table N times.\n Like list: `new = old * N`\n\n Args:\n other (int): multiplier\n\n Returns:\n Table\n \"\"\"\n new = self.copy()\n return new.__imul__(other)\n "},{"location":"reference/base/#tablite.base.BaseTable.__iadd__","title":"tablite.base.BaseTable.__iadd__(other) ","text":"Concatenates tables with same column names. Like list: table_1 += table_2 RAISES DESCRIPTION ValueError If column names don't match. RETURNS DESCRIPTION None self is updated. Source code in tablite/base.py def __iadd__(self, other):\n \"\"\"Concatenates tables with same column names.\n\n Like list: `table_1 += table_2`\n\n Args:\n other (Table)\n\n Raises:\n ValueError: If column names don't match.\n\n Returns:\n None: self is updated.\n \"\"\"\n type_check(other, BaseTable)\n for name in self.columns.keys():\n if name not in other.columns:\n raise ValueError(f\"{name} not in other\")\n for name in other.columns.keys():\n if name not in self.columns:\n raise ValueError(f\"{name} missing from self\")\n\n for name, column in self.columns.items():\n other_col = other.columns.get(name, None)\n column.pages.extend(other_col.pages[:])\n return self\n "},{"location":"reference/base/#tablite.base.BaseTable.__add__","title":"tablite.base.BaseTable.__add__(other) ","text":"Concatenates tables with same column names. 
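A concatenation sketch covering `__add__` and `__iadd__`; both require identical column names and, like the other operators above, only append page references:

```python
from tablite import Table

t1 = Table(columns={"a": [1], "b": [2]})
t2 = Table(columns={"a": [3], "b": [4]})

t3 = t1 + t2              # new table; raises ValueError on column mismatch
t1 += t2                  # in-place: appends t2's pages to t1's columns
assert len(t3) == len(t1) == 2
```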
Like list: table_3 = table_1 + table_2 RAISES DESCRIPTION ValueError If column names don't match. RETURNS DESCRIPTION Table Source code in tablite/base.py def __add__(self, other):\n \"\"\"Concatenates tables with same column names.\n\n Like list: `table_3 = table_1 + table_2`\n\n Args:\n other (Table)\n\n Raises:\n ValueError: If column names don't match.\n\n Returns:\n Table\n \"\"\"\n type_check(other, BaseTable)\n cp = self.copy()\n cp += other\n return cp\n "},{"location":"reference/base/#tablite.base.BaseTable.add_rows","title":"tablite.base.BaseTable.add_rows(*args, **kwargs) ","text":"its more efficient to add many rows at once. if both args and kwargs, then args are added first, followed by kwargs. supported cases: >>> t = Table()\n>>> t.add_columns('row','A','B','C')\n>>> t.add_rows(1, 1, 2, 3) # (1) individual values as args\n>>> t.add_rows([2, 1, 2, 3]) # (2) list of values as args\n>>> t.add_rows((3, 1, 2, 3)) # (3) tuple of values as args\n>>> t.add_rows(*(4, 1, 2, 3)) # (4) unpacked tuple becomes arg like (1)\n>>> t.add_rows(row=5, A=1, B=2, C=3) # (5) kwargs\n>>> t.add_rows(**{'row': 6, 'A': 1, 'B': 2, 'C': 3}) # (6) dict / json interpreted a kwargs\n>>> t.add_rows((7, 1, 2, 3), (8, 4, 5, 6)) # (7) two (or more) tuples as args\n>>> t.add_rows([9, 1, 2, 3], [10, 4, 5, 6]) # (8) two or more lists as rgs\n>>> t.add_rows(\n {'row': 11, 'A': 1, 'B': 2, 'C': 3},\n {'row': 12, 'A': 4, 'B': 5, 'C': 6}\n ) # (9) two (or more) dicts as args - roughly comma sep'd json.\n>>> t.add_rows( *[\n {'row': 13, 'A': 1, 'B': 2, 'C': 3},\n {'row': 14, 'A': 1, 'B': 2, 'C': 3}\n ]) # (10) list of dicts as args\n>>> t.add_rows(row=[15,16], A=[1,1], B=[2,2], C=[3,3]) # (11) kwargs with lists as values\n Source code in tablite/base.py def add_rows(self, *args, **kwargs):\n \"\"\"its more efficient to add many rows at once.\n\n if both args and kwargs, then args are added first, followed by kwargs.\n\n supported cases:\n ```\n >>> t = Table()\n >>> t.add_columns('row','A','B','C')\n >>> t.add_rows(1, 1, 2, 3) # (1) individual values as args\n >>> t.add_rows([2, 1, 2, 3]) # (2) list of values as args\n >>> t.add_rows((3, 1, 2, 3)) # (3) tuple of values as args\n >>> t.add_rows(*(4, 1, 2, 3)) # (4) unpacked tuple becomes arg like (1)\n >>> t.add_rows(row=5, A=1, B=2, C=3) # (5) kwargs\n >>> t.add_rows(**{'row': 6, 'A': 1, 'B': 2, 'C': 3}) # (6) dict / json interpreted a kwargs\n >>> t.add_rows((7, 1, 2, 3), (8, 4, 5, 6)) # (7) two (or more) tuples as args\n >>> t.add_rows([9, 1, 2, 3], [10, 4, 5, 6]) # (8) two or more lists as rgs\n >>> t.add_rows(\n {'row': 11, 'A': 1, 'B': 2, 'C': 3},\n {'row': 12, 'A': 4, 'B': 5, 'C': 6}\n ) # (9) two (or more) dicts as args - roughly comma sep'd json.\n >>> t.add_rows( *[\n {'row': 13, 'A': 1, 'B': 2, 'C': 3},\n {'row': 14, 'A': 1, 'B': 2, 'C': 3}\n ]) # (10) list of dicts as args\n >>> t.add_rows(row=[15,16], A=[1,1], B=[2,2], C=[3,3]) # (11) kwargs with lists as values\n ```\n\n \"\"\"\n if not BaseTable._add_row_slow_warning:\n warnings.warn(\n \"add_rows is slow. Consider using add_columns and then assigning values to the columns directly.\"\n )\n BaseTable._add_row_slow_warning = True\n\n if args:\n if not all(isinstance(i, (list, tuple, dict)) for i in args): # 1,4\n args = [args]\n\n if all(isinstance(i, (list, tuple, dict)) for i in args): # 2,3,7,8\n # 1. 
turn the data into columns:\n\n d = {n: [] for n in self.columns}\n for arg in args:\n if len(arg) != len(self.columns):\n raise ValueError(\n f\"len({arg})== {len(arg)}, but there are {len(self.columns)} columns\"\n )\n\n if isinstance(arg, dict):\n for k, v in arg.items(): # 7,8\n d[k].append(v)\n\n elif isinstance(arg, (list, tuple)): # 2,3\n for n, v in zip(self.columns, arg):\n d[n].append(v)\n\n else:\n raise TypeError(f\"{arg}?\")\n # 2. extend the columns\n for n, values in d.items():\n col = self.columns[n]\n col.extend(list_to_np_array(values))\n\n if kwargs:\n if isinstance(kwargs, dict):\n if all(isinstance(v, (list, tuple)) for v in kwargs.values()):\n for k, v in kwargs.items():\n col = self.columns[k]\n col.extend(list_to_np_array(v))\n else:\n for k, v in kwargs.items():\n col = self.columns[k]\n col.extend(np.array([v]))\n else:\n raise ValueError(f\"format not recognised: {kwargs}\")\n\n return\n "},{"location":"reference/base/#tablite.base.BaseTable.add_columns","title":"tablite.base.BaseTable.add_columns(*names) ","text":"Adds column names to table. Source code in tablite/base.py def add_columns(self, *names):\n \"\"\"Adds column names to table.\"\"\"\n for name in names:\n self.columns[name] = Column(self.path)\n "},{"location":"reference/base/#tablite.base.BaseTable.add_column","title":"tablite.base.BaseTable.add_column(name, data=None) ","text":"verbose alias for table[name] = data, that checks if name already exists PARAMETER DESCRIPTION name column name TYPE: str data values. Defaults to None. TYPE: list,tuple) DEFAULT: None RAISES DESCRIPTION TypeError name isn't string ValueError name already exists Source code in tablite/base.py def add_column(self, name, data=None):\n \"\"\"verbose alias for table[name] = data, that checks if name already exists\n\n Args:\n name (str): column name\n data ((list,tuple), optional): values. Defaults to None.\n\n Raises:\n TypeError: name isn't string\n ValueError: name already exists\n \"\"\"\n if not isinstance(name, str):\n raise TypeError(\"expected name as string\")\n if name in self.columns:\n raise ValueError(f\"{name} already in {self.columns}\")\n self.__setitem__(name, data)\n "},{"location":"reference/base/#tablite.base.BaseTable.stack","title":"tablite.base.BaseTable.stack(other) ","text":"returns the joint stack of tables with overlapping column names. Example: | Table A| + | Table B| = | Table AB |\n| A| B| C| | A| B| D| | A| B| C| -|\n | A| B| -| D|\n Source code in tablite/base.py def stack(self, other):\n \"\"\"\n returns the joint stack of tables with overlapping column names.\n Example:\n ```\n | Table A| + | Table B| = | Table AB |\n | A| B| C| | A| B| D| | A| B| C| -|\n | A| B| -| D|\n ```\n \"\"\"\n if not isinstance(other, BaseTable):\n raise TypeError(f\"stack only works for Table, not {type(other)}\")\n\n cp = self.copy()\n for name, col2 in other.columns.items():\n if name not in cp.columns:\n cp[name] = [None] * len(self)\n cp[name].pages.extend(col2.pages[:])\n\n for name in self.columns:\n if name not in other.columns:\n if len(cp) > 0:\n cp[name].extend(np.array([None] * len(other)))\n return cp\n "},{"location":"reference/base/#tablite.base.BaseTable.types","title":"tablite.base.BaseTable.types() ","text":"returns nested dict of data types in the form: {column name: {python type class: number of instances }, ... 
} example: >>> t.types()\n{\n 'A': {<class 'str'>: 7},\n 'B': {<class 'int'>: 7}\n}\n Source code in tablite/base.py def types(self):\n \"\"\"\n returns nested dict of data types in the form:\n `{column name: {python type class: number of instances }, ... }`\n\n example:\n ```\n >>> t.types()\n {\n 'A': {<class 'str'>: 7},\n 'B': {<class 'int'>: 7}\n }\n ```\n \"\"\"\n d = {}\n for name, col in self.columns.items():\n assert isinstance(col, Column)\n d[name] = col.types()\n return d\n "},{"location":"reference/base/#tablite.base.BaseTable.display_dict","title":"tablite.base.BaseTable.display_dict(slice_=None, blanks=None, dtype=False) ","text":"helper for creating dict for display. PARAMETER DESCRIPTION slice_ python slice. Defaults to None. TYPE: slice DEFAULT: None blanks fill value for None . Defaults to None. TYPE: optional DEFAULT: None dtype Adds datatype to each column. Defaults to False. TYPE: bool DEFAULT: False RAISES DESCRIPTION TypeError slice_ must be None or slice. RETURNS DESCRIPTION dict from Table. Source code in tablite/base.py def display_dict(self, slice_=None, blanks=None, dtype=False):\n \"\"\"helper for creating dict for display.\n\n Args:\n slice_ (slice, optional): python slice. Defaults to None.\n blanks (optional): fill value for `None`. Defaults to None.\n dtype (bool, optional): Adds datatype to each column. Defaults to False.\n\n Raises:\n TypeError: slice_ must be None or slice.\n\n Returns:\n dict: from Table.\n \"\"\"\n if not self.columns:\n print(\"Empty Table\")\n return\n\n def datatype(col): # PRIVATE\n \"\"\"creates label for column datatype.\"\"\"\n types = col.types()\n if len(types) == 0:\n typ = \"empty\"\n elif len(types) == 1:\n dt, _ = types.popitem()\n typ = dt.__name__\n else:\n typ = \"mixed\"\n return typ\n\n row_count_tags = [\"#\", \"~\", \"*\"]\n cols = set(self.columns)\n for n, tag in product(range(1, 6), row_count_tags):\n if n * tag not in cols:\n tag = n * tag\n break\n\n if not isinstance(slice_, (slice, type(None))):\n raise TypeError(f\"slice_ must be None or slice, not {type(slice_)}\")\n if isinstance(slice_, slice):\n slc = slice_\n if slice_ is None:\n if len(self) <= 20:\n slc = slice(0, 20, 1)\n else:\n slc = None\n\n n = len(self)\n if slc: # either we want slc or we want everything.\n row_no = list(range(*slc.indices(len(self))))\n data = {tag: [f\"{i:,}\".rjust(2) for i in row_no]}\n for name, col in self.columns.items():\n data[name] = list(chain(iter(col), repeat(blanks, times=n - len(col))))[\n slc\n ]\n else:\n data = {}\n j = int(math.ceil(math.log10(n)) / 3) + len(str(n))\n row_no = (\n [f\"{i:,}\".rjust(j) for i in range(7)]\n + [\"...\"]\n + [f\"{i:,}\".rjust(j) for i in range(n - 7, n)]\n )\n data = {tag: row_no}\n\n for name, col in self.columns.items():\n if len(col) == n:\n row = col[:7].tolist() + [\"...\"] + col[-7:].tolist()\n else:\n empty = [blanks] * 7\n head = (col[:7].tolist() + empty)[:7]\n tail = (col[n - 7 :].tolist() + empty)[-7:]\n row = head + [\"...\"] + tail\n data[name] = row\n\n if dtype:\n for name, values in data.items():\n if name in self.columns:\n col = self.columns[name]\n values.insert(0, datatype(col))\n else:\n values.insert(0, \"row\")\n\n return data\n "},{"location":"reference/base/#tablite.base.BaseTable.to_ascii","title":"tablite.base.BaseTable.to_ascii(slice_=None, blanks=None, dtype=False) ","text":"returns ascii view of table as string. PARAMETER DESCRIPTION slice_ slice to determine table snippet. TYPE: slice DEFAULT: None blanks value for whitespace. Defaults to None. 
TYPE: str DEFAULT: None dtype adds subheader with datatype for column. Defaults to False. TYPE: bool DEFAULT: False Source code in tablite/base.py def to_ascii(self, slice_=None, blanks=None, dtype=False):\n \"\"\"returns ascii view of table as string.\n\n Args:\n slice_ (slice, optional): slice to determine table snippet.\n blanks (str, optional): value for whitespace. Defaults to None.\n dtype (bool, optional): adds subheader with datatype for column. Defaults to False.\n \"\"\"\n\n def adjust(v, length): # PRIVATE FUNCTION\n \"\"\"whitespace justifies field values based on datatype\"\"\"\n if v is None:\n return str(blanks).ljust(length)\n elif isinstance(v, str):\n return v.ljust(length)\n else:\n return str(v).rjust(length)\n\n if not self.columns:\n return str(self)\n\n d = {}\n for name, values in self.display_dict(\n slice_=slice_, blanks=blanks, dtype=dtype\n ).items():\n as_text = [str(v) for v in values] + [str(name)]\n width = max(len(i) for i in as_text)\n new_name = name.center(width, \" \")\n if dtype:\n values[0] = values[0].center(width, \" \")\n d[new_name] = [adjust(v, width) for v in values]\n\n rows = dict_to_rows(d)\n s = []\n s.append(\"+\" + \"+\".join([\"=\" * len(n) for n in rows[0]]) + \"+\")\n s.append(\"|\" + \"|\".join(rows[0]) + \"|\") # column names\n start = 1\n if dtype:\n s.append(\"|\" + \"|\".join(rows[1]) + \"|\") # datatypes\n start = 2\n\n s.append(\"+\" + \"+\".join([\"-\" * len(n) for n in rows[0]]) + \"+\")\n for row in rows[start:]:\n s.append(\"|\" + \"|\".join(row) + \"|\")\n s.append(\"+\" + \"+\".join([\"=\" * len(n) for n in rows[0]]) + \"+\")\n\n if len(set(len(c) for c in self.columns.values())) != 1:\n warning = f\"Warning: Columns have different lengths. {blanks} is used as fill value.\"\n s.append(warning)\n\n return \"\\n\".join(s)\n "},{"location":"reference/base/#tablite.base.BaseTable.show","title":"tablite.base.BaseTable.show(slice_=None, blanks=None, dtype=False) ","text":"prints ascii view of table. PARAMETER DESCRIPTION slice_ slice to determine table snippet. TYPE: slice DEFAULT: None blanks value for whitespace. Defaults to None. TYPE: str DEFAULT: None dtype adds subheader with datatype for column. Defaults to False. TYPE: bool DEFAULT: False Source code in tablite/base.py def show(self, slice_=None, blanks=None, dtype=False):\n \"\"\"prints ascii view of table.\n\n Args:\n slice_ (slice, optional): slice to determine table snippet.\n blanks (str, optional): value for whitespace. Defaults to None.\n dtype (bool, optional): adds subheader with datatype for column. Defaults to False.\n \"\"\"\n print(self.to_ascii(slice_=slice_, blanks=blanks, dtype=dtype))\n "},{"location":"reference/base/#tablite.base.BaseTable.to_dict","title":"tablite.base.BaseTable.to_dict(columns=None, slice_=None) ","text":"columns: list of column names. Default is None == all columns. slice_: slice. Default is None == all rows. returns: dict with columns as keys and lists of values. Example: >>> t.show()\n+===+===+===+\n| # | a | b |\n|row|int|int|\n+---+---+---+\n| 0 | 1| 3|\n| 1 | 2| 4|\n+===+===+===+\n>>> t.to_dict()\n{'a':[1,2], 'b':[3,4]}\n Source code in tablite/base.py def to_dict(self, columns=None, slice_=None):\n \"\"\"\n columns: list of column names. Default is None == all columns.\n slice_: slice. 
Default is None == all rows.\n\n returns: dict with columns as keys and lists of values.\n\n Example:\n ```\n >>> t.show()\n +===+===+===+\n | # | a | b |\n |row|int|int|\n +---+---+---+\n | 0 | 1| 3|\n | 1 | 2| 4|\n +===+===+===+\n >>> t.to_dict()\n {'a':[1,2], 'b':[3,4]}\n ```\n\n \"\"\"\n if slice_ is None:\n slice_ = slice(0, len(self))\n assert isinstance(slice_, slice)\n\n if columns is None:\n columns = list(self.columns.keys())\n if not isinstance(columns, list):\n raise TypeError(\"expected columns as list of strings\")\n\n return {name: list(self.columns[name][slice_]) for name in columns}\n "},{"location":"reference/base/#tablite.base.BaseTable.as_json_serializable","title":"tablite.base.BaseTable.as_json_serializable(row_count='row id', start_on=1, columns=None, slice_=None) ","text":"provides a JSON compatible format of the table. PARAMETER DESCRIPTION row_count Label for row counts. Defaults to \"row id\". TYPE: str DEFAULT: 'row id' start_on row counts starts by default on 1. TYPE: int DEFAULT: 1 columns Column names. Defaults to None which returns all columns. TYPE: list of str DEFAULT: None slice_ selector. Defaults to None which returns [:] TYPE: slice DEFAULT: None RETURNS DESCRIPTION JSON serializable dict: All python datatypes have been converted to JSON compliant data. Source code in tablite/base.py def as_json_serializable(\n self, row_count=\"row id\", start_on=1, columns=None, slice_=None\n):\n \"\"\"provides a JSON compatible format of the table.\n\n Args:\n row_count (str, optional): Label for row counts. Defaults to \"row id\".\n start_on (int, optional): row counts starts by default on 1.\n columns (list of str, optional): Column names.\n Defaults to None which returns all columns.\n slice_ (slice, optional): selector. Defaults to None which returns [:]\n\n Returns:\n JSON serializable dict: All python datatypes have been converted to JSON compliant data.\n \"\"\"\n if slice_ is None:\n slice_ = slice(0, len(self))\n\n assert isinstance(slice_, slice)\n new = {\"columns\": {}, \"total_rows\": len(self)}\n if row_count is not None:\n new[\"columns\"][row_count] = [\n i + start_on for i in range(*slice_.indices(len(self)))\n ]\n\n d = self.to_dict(columns, slice_=slice_)\n for k, data in d.items():\n new_k = unique_name(\n k, new[\"columns\"]\n ) # used to avoid overwriting the `row id` key.\n new[\"columns\"][new_k] = [\n DataTypes.to_json(v) for v in data\n ] # deal with non-json datatypes.\n return new\n "},{"location":"reference/base/#tablite.base.BaseTable.index","title":"tablite.base.BaseTable.index(*args) ","text":"param: *args: column names returns multikey index on the columns as d[(key tuple, )] = {index1, index2, ...} Examples: >>> table6 = Table()\n>>> table6['A'] = ['Alice', 'Bob', 'Bob', 'Ben', 'Charlie', 'Ben','Albert']\n>>> table6['B'] = ['Alison', 'Marley', 'Dylan', 'Affleck', 'Hepburn', 'Barnes', 'Einstein']\n >>> table6.index('A') # single key.\n{('Alice',): [0],\n ('Bob',): [1, 2],\n ('Ben',): [3, 5],\n ('Charlie',): [4],\n ('Albert',): [6]})\n >>> table6.index('A', 'B') # multiple keys.\n{('Alice', 'Alison'): [0],\n ('Bob', 'Marley'): [1],\n ('Bob', 'Dylan'): [2],\n ('Ben', 'Affleck'): [3],\n ('Charlie', 'Hepburn'): [4],\n ('Ben', 'Barnes'): [5],\n ('Albert', 'Einstein'): [6]})\n Source code in tablite/base.py def index(self, *args):\n \"\"\"\n param: *args: column names\n returns multikey index on the columns as d[(key tuple, )] = {index1, index2, ...}\n\n Examples:\n ```\n >>> table6 = Table()\n >>> table6['A'] = ['Alice', 'Bob', 'Bob', 
'Ben', 'Charlie', 'Ben','Albert']\n >>> table6['B'] = ['Alison', 'Marley', 'Dylan', 'Affleck', 'Hepburn', 'Barnes', 'Einstein']\n ```\n\n ```\n >>> table6.index('A') # single key.\n {('Alice',): [0],\n ('Bob',): [1, 2],\n ('Ben',): [3, 5],\n ('Charlie',): [4],\n ('Albert',): [6]})\n ```\n\n ```\n >>> table6.index('A', 'B') # multiple keys.\n {('Alice', 'Alison'): [0],\n ('Bob', 'Marley'): [1],\n ('Bob', 'Dylan'): [2],\n ('Ben', 'Affleck'): [3],\n ('Charlie', 'Hepburn'): [4],\n ('Ben', 'Barnes'): [5],\n ('Albert', 'Einstein'): [6]})\n ```\n\n \"\"\"\n idx = defaultdict(list)\n iterators = [iter(self.columns[c]) for c in args]\n for ix, key in enumerate(zip(*iterators)):\n key = tuple(numpy_to_python(k) for k in key)\n idx[key].append(ix)\n return idx\n "},{"location":"reference/base/#tablite.base.BaseTable.unique_index","title":"tablite.base.BaseTable.unique_index(*args, tqdm=_tqdm) ","text":"generates the index of unique rows given a list of column names PARAMETER DESCRIPTION *args columns names TYPE: any DEFAULT: () tqdm Defaults to _tqdm. TYPE: tqdm DEFAULT: tqdm RETURNS DESCRIPTION np.array(int64): indices of unique records. Source code in tablite/base.py def unique_index(self, *args, tqdm=_tqdm):\n \"\"\"generates the index of unique rows given a list of column names\n\n Args:\n *args (any): columns names\n tqdm (tqdm, optional): Defaults to _tqdm.\n\n Returns:\n np.array(int64): indices of unique records.\n \"\"\"\n if not args:\n raise ValueError(\"*args (column names) is required\")\n seen = set()\n unique = set()\n iterators = [iter(self.columns[c]) for c in args]\n for ix, key in tqdm(enumerate(zip(*iterators)), disable=Config.TQDM_DISABLE):\n key_hash = hash(tuple(numpy_to_python(k) for k in key))\n if key_hash in seen:\n continue\n else:\n seen.add(key_hash)\n unique.add(ix)\n return np.array(sorted(unique))\n "},{"location":"reference/base/#tablite.base-functions","title":"Functions","text":""},{"location":"reference/base/#tablite.base.register","title":"tablite.base.register(path) ","text":"registers path in file_registry The method is used by Table during init when the working directory path is set, so that python can clean all temporary files up at exit. PARAMETER DESCRIPTION path typically tmp/tablite-tmp/PID-{os.getpid()} TYPE: Path Source code in tablite/base.py def register(path):\n \"\"\"registers path in file_registry\n\n The method is used by Table during init when the working directory path\n is set, so that python can clean all temporary files up at exit.\n\n Args:\n path (Path): typically tmp/tablite-tmp/PID-{os.getpid()}\n \"\"\"\n global file_registry\n file_registry.add(path)\n "},{"location":"reference/base/#tablite.base.shutdown","title":"tablite.base.shutdown() ","text":"method to clean up temporary files triggered at shutdown. Source code in tablite/base.py def shutdown():\n \"\"\"method to clean up temporary files triggered at shutdown.\"\"\"\n for path in file_registry:\n if Config.pid in str(path): # safety feature to prevent rm -rf /\n log.debug(f\"shutdown: running rmtree({path})\")\n shutil.rmtree(path)\n "},{"location":"reference/config/","title":"Config","text":""},{"location":"reference/config/#tablite.config","title":"tablite.config ","text":""},{"location":"reference/config/#tablite.config-classes","title":"Classes","text":""},{"location":"reference/config/#tablite.config.Config","title":"tablite.config.Config ","text":" Bases: object Config class for Tablite Tables. 
The default location for the storage is loaded as Config.workdir = pathlib.Path(os.environ.get(\"TABLITE_TMPDIR\", f\"{tempfile.gettempdir()}/tablite-tmp\"))\n To overwrite, first import the config class, then set the new workdir. >>> from tablite import config\n>>> from pathlib import Path\n>>> config.workdir = Path(\"/this/new/location\")\n the new path will now be used for every new table. PAGE_SIZE = 1_000_000 sets the page size limit. Multiprocessing is enabled in one of three modes: AUTO = \"auto\" FALSE = \"sp\" FORCE = \"mp\" MULTIPROCESSING_MODE = AUTO is default. SINGLE_PROCESSING_LIMIT = 1_000_000 when the number of fields (rows x columns) exceeds this value, multiprocessing is used. "},{"location":"reference/config/#tablite.config.Config-attributes","title":"Attributes","text":""},{"location":"reference/config/#tablite.config.Config.USE_NIMPORTER","title":"tablite.config.Config.USE_NIMPORTER = os.environ.get('USE_NIMPORTER', 'true').lower() in ['1', 't', 'true', 'y', 'yes'] class-attribute instance-attribute ","text":""},{"location":"reference/config/#tablite.config.Config.ALLOW_CSV_READER_FALLTHROUGH","title":"tablite.config.Config.ALLOW_CSV_READER_FALLTHROUGH = os.environ.get('ALLOW_CSV_READER_FALLTHROUGH', 'true').lower() in ['1', 't', 'true', 'y', 'yes'] class-attribute instance-attribute ","text":""},{"location":"reference/config/#tablite.config.Config.NIM_SUPPORTED_CONV_TYPES","title":"tablite.config.Config.NIM_SUPPORTED_CONV_TYPES = ['Windows-1252', 'ISO-8859-1'] class-attribute instance-attribute ","text":""},{"location":"reference/config/#tablite.config.Config.workdir","title":"tablite.config.Config.workdir = pathlib.Path(os.environ.get('TABLITE_TMPDIR', f'{tempfile.gettempdir()}/tablite-tmp')) class-attribute instance-attribute ","text":""},{"location":"reference/config/#tablite.config.Config.pid","title":"tablite.config.Config.pid = f'pid-{os.getpid()}' class-attribute instance-attribute ","text":""},{"location":"reference/config/#tablite.config.Config.PAGE_SIZE","title":"tablite.config.Config.PAGE_SIZE = 1000000 class-attribute instance-attribute ","text":""},{"location":"reference/config/#tablite.config.Config.ENCODING","title":"tablite.config.Config.ENCODING = 'UTF-8' class-attribute instance-attribute ","text":""},{"location":"reference/config/#tablite.config.Config.DISK_LIMIT","title":"tablite.config.Config.DISK_LIMIT = int(10000000000.0) class-attribute instance-attribute ","text":"10e9 (10Gb) on 100 Gb disk means raise at 90 Gb disk usage. if DISK_LIMIT <= 0, the check is turned off. "},{"location":"reference/config/#tablite.config.Config.SINGLE_PROCESSING_LIMIT","title":"tablite.config.Config.SINGLE_PROCESSING_LIMIT = 1000000 class-attribute instance-attribute ","text":"when the number of fields (rows x columns) exceeds this value, multiprocessing is used. 
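For example, these settings can be tightened before a memory-intensive run and restored afterwards. A minimal sketch, using only the attributes documented above; the values shown are illustrative, not recommendations: ```python
from tablite.config import Config

Config.PAGE_SIZE = 500_000                   # smaller pages lower peak memory per task.
Config.MULTIPROCESSING_MODE = Config.FALSE   # "sp" forces single-process execution.
# ... run the memory-sensitive workload here ...
Config.reset()                               # restores the documented defaults.
``` 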
"},{"location":"reference/config/#tablite.config.Config.vpus","title":"tablite.config.Config.vpus = max(os.cpu_count() - 1, 1) class-attribute instance-attribute ","text":""},{"location":"reference/config/#tablite.config.Config.AUTO","title":"tablite.config.Config.AUTO = 'auto' class-attribute instance-attribute ","text":""},{"location":"reference/config/#tablite.config.Config.FALSE","title":"tablite.config.Config.FALSE = 'sp' class-attribute instance-attribute ","text":""},{"location":"reference/config/#tablite.config.Config.FORCE","title":"tablite.config.Config.FORCE = 'mp' class-attribute instance-attribute ","text":""},{"location":"reference/config/#tablite.config.Config.MULTIPROCESSING_MODE","title":"tablite.config.Config.MULTIPROCESSING_MODE = AUTO class-attribute instance-attribute ","text":""},{"location":"reference/config/#tablite.config.Config.TQDM_DISABLE","title":"tablite.config.Config.TQDM_DISABLE = False class-attribute instance-attribute ","text":""},{"location":"reference/config/#tablite.config.Config-functions","title":"Functions","text":""},{"location":"reference/config/#tablite.config.Config.reset","title":"tablite.config.Config.reset() classmethod ","text":"Resets the config class to original values. Source code in tablite/config.py @classmethod\ndef reset(cls):\n \"\"\"Resets the config class to original values.\"\"\"\n for k, v in _default_values.items():\n setattr(Config, k, v)\n "},{"location":"reference/config/#tablite.config.Config.page_steps","title":"tablite.config.Config.page_steps(length) classmethod ","text":"an iterator that yield start and end in page sizes YIELDS DESCRIPTION tuple start:int, end:int Source code in tablite/config.py @classmethod\ndef page_steps(cls, length):\n \"\"\"an iterator that yield start and end in page sizes\n\n Yields:\n tuple: start:int, end:int\n \"\"\"\n start, end = 0, 0\n for _ in range(0, length + 1, cls.PAGE_SIZE):\n start, end = end, min(end + cls.PAGE_SIZE, length)\n yield start, end\n if end == length:\n return\n "},{"location":"reference/core/","title":"Core","text":""},{"location":"reference/core/#tablite.core","title":"tablite.core ","text":""},{"location":"reference/core/#tablite.core-attributes","title":"Attributes","text":""},{"location":"reference/core/#tablite.core.log","title":"tablite.core.log = logging.getLogger(__name__) module-attribute ","text":""},{"location":"reference/core/#tablite.core-classes","title":"Classes","text":""},{"location":"reference/core/#tablite.core.Table","title":"tablite.core.Table(columns=None, headers=None, rows=None, _path=None) ","text":" Bases: BaseTable creates Table PARAMETER DESCRIPTION EITHER columns (dict, optional): dict with column names as keys, values as lists. 
Example: t = Table(columns={\"a\": [1, 2], \"b\": [3, 4]}) Source code in tablite/core.py def __init__(self, columns=None, headers=None, rows=None, _path=None) -> None:\n \"\"\"creates Table\n\n Args:\n EITHER:\n columns (dict, optional): dict with column names as keys, values as lists.\n Example: t = Table(columns={\"a\": [1, 2], \"b\": [3, 4]})\n OR\n headers (list of strings, optional): list of column names.\n rows (list of tuples or lists, optional): values for columns\n Example: t = Table(headers=[\"a\", \"b\"], rows=[[1,3], [2,4]])\n \"\"\"\n super().__init__(columns, headers, rows, _path)\n "},{"location":"reference/core/#tablite.core.Table-attributes","title":"Attributes","text":""},{"location":"reference/core/#tablite.core.Table.path","title":"tablite.core.Table.path = _path instance-attribute ","text":""},{"location":"reference/core/#tablite.core.Table.columns","title":"tablite.core.Table.columns = {} instance-attribute ","text":""},{"location":"reference/core/#tablite.core.Table.rows","title":"tablite.core.Table.rows property ","text":"enables row based iteration in python types. Example: for row in Table.rows:\n print(row)\n Yields: tuple: values is same order as columns. "},{"location":"reference/core/#tablite.core.Table-functions","title":"Functions","text":""},{"location":"reference/core/#tablite.core.Table.__str__","title":"tablite.core.Table.__str__() ","text":"Source code in tablite/base.py def __str__(self): # USER FUNCTION.\n return f\"{self.__class__.__name__}({len(self.columns):,} columns, {len(self):,} rows)\"\n "},{"location":"reference/core/#tablite.core.Table.__repr__","title":"tablite.core.Table.__repr__() ","text":"Source code in tablite/base.py def __repr__(self):\n return self.__str__()\n "},{"location":"reference/core/#tablite.core.Table.nbytes","title":"tablite.core.Table.nbytes() ","text":"finds the total bytes of the table on disk RETURNS DESCRIPTION tuple int: real bytes used on disk int: total bytes used if flattened Source code in tablite/base.py def nbytes(self): # USER FUNCTION.\n \"\"\"finds the total bytes of the table on disk\n\n Returns:\n tuple:\n int: real bytes used on disk\n int: total bytes used if flattened\n \"\"\"\n real = {}\n total = 0\n for column in self.columns.values():\n for page in set(column.pages):\n real[page] = page.path.stat().st_size\n for page in column.pages:\n total += real[page]\n return sum(real.values()), total\n "},{"location":"reference/core/#tablite.core.Table.items","title":"tablite.core.Table.items() ","text":"returns table as dict RETURNS DESCRIPTION dict Table as dict {column_name: [values], ...} Source code in tablite/base.py def items(self): # USER FUNCTION.\n \"\"\"returns table as dict\n\n Returns:\n dict: Table as dict `{column_name: [values], ...}`\n \"\"\"\n return {\n name: column[:].tolist() for name, column in self.columns.items()\n }.items()\n "},{"location":"reference/core/#tablite.core.Table.__delitem__","title":"tablite.core.Table.__delitem__(key) ","text":"Examples: >>> del table['a'] # removes column 'a'\n>>> del table[-3:] # removes last 3 rows from all columns.\n Source code in tablite/base.py def __delitem__(self, key): # USER FUNCTION.\n \"\"\"\n Examples:\n ```\n >>> del table['a'] # removes column 'a'\n >>> del table[-3:] # removes last 3 rows from all columns.\n ```\n \"\"\"\n if isinstance(key, (int, slice)):\n for column in self.columns.values():\n del column[key]\n elif key in self.columns:\n del self.columns[key]\n else:\n raise KeyError(f\"Key not found: {key}\")\n 
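Taken together, the constructor, rows and del behave like their dict/list counterparts. A minimal sketch using only the calls documented in this section: ```python
from tablite import Table

t = Table(columns={"a": [1, 2, 3], "b": [4, 5, 6]})
for row in t.rows:   # row-wise iteration in python types, e.g. (1, 4)
    print(row)
del t[-1:]           # removes the last row from all columns.
del t["b"]           # removes column 'b'.
assert len(t) == 2
``` 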
"},{"location":"reference/core/#tablite.core.Table.__setitem__","title":"tablite.core.Table.__setitem__(key, value) ","text":"table behaves like a dict. Args: key (str or hashable): column name value (iterable): list, tuple or nd.array with values. As Table now accepts the keyword columns as a dict: >>> t = Table(columns={'b':[4,5,6], 'c':[7,8,9]})\n and the header/data combinations: >>> t = Table(header=['b','c'], data=[[4,5,6],[7,8,9]])\n This has the side-benefit that tuples now can be used as headers. Source code in tablite/base.py def __setitem__(self, key, value): # USER FUNCTION\n \"\"\"table behaves like a dict.\n Args:\n key (str or hashable): column name\n value (iterable): list, tuple or nd.array with values.\n\n As Table now accepts the keyword `columns` as a dict:\n ```\n >>> t = Table(columns={'b':[4,5,6], 'c':[7,8,9]})\n ```\n and the header/data combinations:\n ```\n >>> t = Table(header=['b','c'], data=[[4,5,6],[7,8,9]])\n ```\n This has the side-benefit that tuples now can be used as headers.\n \"\"\"\n if value is None:\n self.columns[key] = Column(self.path, value=None)\n elif isinstance(value, (list, tuple)):\n value = list_to_np_array(value)\n self.columns[key] = Column(self.path, value)\n elif isinstance(value, (np.ndarray)):\n self.columns[key] = Column(self.path, value)\n elif isinstance(value, Column):\n self.columns[key] = value\n else:\n raise TypeError(f\"{type(value)} not supported.\")\n "},{"location":"reference/core/#tablite.core.Table.__getitem__","title":"tablite.core.Table.__getitem__(keys) ","text":"Enables selection of columns and rows PARAMETER DESCRIPTION keys TYPE: column name, integer or slice Examples >>> 10] selects first 10 rows from all columns TYPE: table[ >>> 20:3] selects column 'b' and 'c' and 'a' twice for a slice. TYPE: table['b', 'a', 'a', 'c', 2 Raises: KeyError: if key is not found. TypeError: if key is not a string, integer or slice. RETURNS DESCRIPTION Table returns columns in same order as selection. Source code in tablite/base.py def __getitem__(self, keys): # USER FUNCTION\n \"\"\"\n Enables selection of columns and rows\n\n Args:\n keys (column name, integer or slice):\n Examples:\n ```\n >>> table['a'] selects column 'a'\n >>> table[3] selects row 3 as a tuple.\n >>> table[:10] selects first 10 rows from all columns\n >>> table['a','b', slice(3,20,2)] selects a slice from columns 'a' and 'b'\n >>> table['b', 'a', 'a', 'c', 2:20:3] selects column 'b' and 'c' and 'a' twice for a slice.\n >>> table[('b', 'a', 'a', 'c')] selects columns 'b', 'a', 'a', and 'c' using a tuple.\n ```\n Raises:\n KeyError: if key is not found.\n TypeError: if key is not a string, integer or slice.\n\n Returns:\n Table: returns columns in same order as selection.\n \"\"\"\n\n if not isinstance(keys, tuple):\n if isinstance(keys, list):\n keys = tuple(keys)\n else:\n keys = (keys,)\n if isinstance(keys[0], tuple):\n keys = tuple(list(chain(*keys)))\n\n integers = [i for i in keys if isinstance(i, int)]\n if len(integers) == len(keys) == 1: # return a single tuple.\n keys = [slice(keys[0])]\n\n column_names = [i for i in keys if isinstance(i, str)]\n column_names = list(self.columns) if not column_names else column_names\n not_found = [name for name in column_names if name not in self.columns]\n if not_found:\n raise KeyError(f\"keys not found: {', '.join(not_found)}\")\n\n slices = [i for i in keys if isinstance(i, slice)]\n slc = slice(0, len(self)) if not slices else slices[0]\n\n if (\n len(slices) == 0 and len(column_names) == 1\n ): # e.g. 
tbl['a'] or tbl['a'][:10]\n col = self.columns[column_names[0]]\n if slices:\n return col[slc] # return slice from column as list of values\n else:\n return col # return whole column\n\n elif len(integers) == 1: # return a single tuple.\n row_no = integers[0]\n slc = slice(row_no, row_no + 1)\n return tuple(self.columns[name][slc].tolist()[0] for name in column_names)\n\n elif not slices: # e.g. new table with N whole columns.\n return self.__class__(\n columns={name: self.columns[name] for name in column_names}\n )\n\n else: # e.g. new table from selection of columns and slices.\n t = self.__class__()\n for name in column_names:\n column = self.columns[name]\n\n new_column = Column(t.path) # create new Column.\n for item in column.getpages(slc):\n if isinstance(item, np.ndarray):\n new_column.extend(item) # extend subslice (expensive)\n elif isinstance(item, SimplePage):\n new_column.pages.append(item) # extend page (cheap)\n else:\n raise TypeError(f\"Bad item: {item}\")\n\n # below:\n # set the new column directly on t.columns.\n # Do not use t[name] as that triggers __setitem__ again.\n t.columns[name] = new_column\n\n return t\n "},{"location":"reference/core/#tablite.core.Table.__len__","title":"tablite.core.Table.__len__() ","text":"Source code in tablite/base.py def __len__(self): # USER FUNCTION.\n if not self.columns:\n return 0\n return max(len(c) for c in self.columns.values())\n "},{"location":"reference/core/#tablite.core.Table.__eq__","title":"tablite.core.Table.__eq__(other) -> bool ","text":"Determines if two tables have identical content. PARAMETER DESCRIPTION other table for comparison TYPE: Table RETURNS DESCRIPTION bool True if tables are identical. TYPE: bool Source code in tablite/base.py def __eq__(self, other) -> bool: # USER FUNCTION.\n \"\"\"Determines if two tables have identical content.\n\n Args:\n other (Table): table for comparison\n\n Returns:\n bool: True if tables are identical.\n \"\"\"\n if isinstance(other, dict):\n return self.items() == other.items()\n if not isinstance(other, BaseTable):\n return False\n if id(self) == id(other):\n return True\n if len(self) != len(other):\n return False\n if len(self) == len(other) == 0:\n return True\n if self.columns.keys() != other.columns.keys():\n return False\n for name, col in self.columns.items():\n if not (col == other.columns[name]):\n return False\n return True\n "},{"location":"reference/core/#tablite.core.Table.clear","title":"tablite.core.Table.clear() ","text":"clears the table. Like dict().clear() Source code in tablite/base.py def clear(self): # USER FUNCTION.\n \"\"\"clears the table. Like dict().clear()\"\"\"\n self.columns.clear()\n "},{"location":"reference/core/#tablite.core.Table.save","title":"tablite.core.Table.save(path, compression_method=zipfile.ZIP_DEFLATED, compression_level=1) ","text":"saves table to compressed tpz file. PARAMETER DESCRIPTION path file destination. TYPE: Path compression_method See zipfile compression methods. Defaults to ZIP_DEFLATED. DEFAULT: ZIP_DEFLATED compression_level See zipfile compression levels. Defaults to 1. DEFAULT: 1 The file format is as follows: .tpz is a gzip archive with table metadata captured as table.yml and the necessary set of pages saved as .npy files. 
The zip contains table.yml which provides an overview of the data: --------------------------------------\n%YAML 1.2 yaml version\ncolumns: start of columns section.\n name: \u201c\u5217 1\u201d name of column 1.\n pages: [p1b1, p1b2] list of pages in column 1.\n name: \u201c\u5217 2\u201d name of column 2\n pages: [p2b1, p2b2] list of pages in column 2.\n----------------------------------------\n Source code in tablite/base.py def save(\n self, path, compression_method=zipfile.ZIP_DEFLATED, compression_level=1\n): # USER FUNCTION.\n \"\"\"saves table to compressed tpz file.\n\n Args:\n path (Path): file destination.\n compression_method: See zipfile compression methods. Defaults to ZIP_DEFLATED.\n compression_level: See zipfile compression levels. Defaults to 1.\n The default settings produce 80% compression at 10% slowdown.\n\n The file format is as follows:\n .tpz is a gzip archive with table metadata captured as table.yml\n and the necessary set of pages saved as .npy files.\n\n The zip contains table.yml which provides an overview of the data:\n ```\n --------------------------------------\n %YAML 1.2 yaml version\n columns: start of columns section.\n name: \u201c\u5217 1\u201d name of column 1.\n pages: [p1b1, p1b2] list of pages in column 1.\n name: \u201c\u5217 2\u201d name of column 2\n pages: [p2b1, p2b2] list of pages in column 2.\n ----------------------------------------\n ```\n \"\"\"\n if isinstance(path, str):\n path = Path(path)\n type_check(path, Path)\n if path.is_dir():\n raise TypeError(f\"filename needed: {path}\")\n if path.suffix != \".tpz\":\n path = path.parent / (path.parts[-1] + \".tpz\")\n\n # create yaml document\n _page_counter = 0\n d = {}\n cols = {}\n for name, col in self.columns.items():\n type_check(col, Column)\n cols[name] = {\"pages\": [p.path.name for p in col.pages]}\n _page_counter += len(col.pages)\n d[\"columns\"] = cols\n yml = yaml.safe_dump(\n d, sort_keys=False, allow_unicode=True, default_flow_style=None\n )\n\n _file_counter = 0\n with zipfile.ZipFile(\n path, \"w\", compression=compression_method, compresslevel=compression_level\n ) as f:\n log.debug(f\"writing .tpz to {path} with\\n{yml}\")\n f.writestr(\"table.yml\", yml)\n for name, col in self.columns.items():\n for page in set(\n col.pages\n ): # set of pages! remember t *= 1000 repeats t 1000x\n with open(page.path, \"rb\", buffering=0) as raw_io:\n f.writestr(page.path.name, raw_io.read())\n _file_counter += 1\n log.debug(f\"adding Page {page.path}\")\n\n _fields = len(self) * len(self.columns)\n _avg = _fields // _page_counter\n log.debug(\n f\"Wrote {_fields:,} on {_page_counter:,} pages in {_file_counter} files: {_avg} fields/page\"\n )\n "},{"location":"reference/core/#tablite.core.Table.load","title":"tablite.core.Table.load(path, tqdm=_tqdm) classmethod ","text":"loads a table from .tpz file. See also Table.save for details on the file format. PARAMETER DESCRIPTION path source file TYPE: Path RETURNS DESCRIPTION Table table in read-only mode. 
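A save/load round trip might look like this (a minimal sketch; the file name is illustrative): ```python
from tablite import Table

t = Table(columns={"a": [1, 2], "b": [3, 4]})
t.save("example.tpz")           # the .tpz suffix is appended if missing, see save() above.
t2 = Table.load("example.tpz")  # returns the table in read-only mode.
assert t == t2                  # __eq__ compares content, not identity.
``` 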
Source code in tablite/base.py @classmethod\ndef load(cls, path, tqdm=_tqdm): # USER FUNCTION.\n \"\"\"loads a table from .tpz file.\n See also Table.save for details on the file format.\n\n Args:\n path (Path): source file\n\n Returns:\n Table: table in read-only mode.\n \"\"\"\n path = Path(path)\n log.debug(f\"loading {path}\")\n with zipfile.ZipFile(path, \"r\") as f:\n yml = f.read(\"table.yml\")\n metadata = yaml.safe_load(yml)\n t = cls()\n\n page_count = sum([len(c[\"pages\"]) for c in metadata[\"columns\"].values()])\n\n with tqdm(\n total=page_count,\n desc=f\"loading '{path.name}' file\",\n disable=Config.TQDM_DISABLE,\n ) as pbar:\n for name, d in metadata[\"columns\"].items():\n column = Column(t.path)\n for page in d[\"pages\"]:\n bytestream = io.BytesIO(f.read(page))\n data = np.load(bytestream, allow_pickle=True, fix_imports=False)\n column.extend(data)\n pbar.update(1)\n t.columns[name] = column\n update_access_time(path)\n return t\n "},{"location":"reference/core/#tablite.core.Table.copy","title":"tablite.core.Table.copy() ","text":"Source code in tablite/base.py def copy(self):\n cls = type(self)\n t = cls()\n for name, column in self.columns.items():\n new = Column(t.path)\n new.pages = column.pages[:]\n t.columns[name] = new\n return t\n "},{"location":"reference/core/#tablite.core.Table.__imul__","title":"tablite.core.Table.__imul__(other) ","text":"Repeats instance of table N times. Like list: t = t * N PARAMETER DESCRIPTION other multiplier TYPE: int Source code in tablite/base.py def __imul__(self, other):\n \"\"\"Repeats instance of table N times.\n\n Like list: `t = t * N`\n\n Args:\n other (int): multiplier\n \"\"\"\n if not (isinstance(other, int) and other > 0):\n raise TypeError(\n f\"a table can be repeated an integer number of times, not {type(other)} number of times\"\n )\n for col in self.columns.values():\n col *= other\n return self\n "},{"location":"reference/core/#tablite.core.Table.__mul__","title":"tablite.core.Table.__mul__(other) ","text":"Repeat table N times. Like list: new = old * N PARAMETER DESCRIPTION other multiplier TYPE: int RETURNS DESCRIPTION Table Source code in tablite/base.py def __mul__(self, other):\n \"\"\"Repeat table N times.\n Like list: `new = old * N`\n\n Args:\n other (int): multiplier\n\n Returns:\n Table\n \"\"\"\n new = self.copy()\n return new.__imul__(other)\n "},{"location":"reference/core/#tablite.core.Table.__iadd__","title":"tablite.core.Table.__iadd__(other) ","text":"Concatenates tables with same column names. Like list: table_1 += table_2 RAISES DESCRIPTION ValueError If column names don't match. RETURNS DESCRIPTION None self is updated. Source code in tablite/base.py def __iadd__(self, other):\n \"\"\"Concatenates tables with same column names.\n\n Like list: `table_1 += table_2`\n\n Args:\n other (Table)\n\n Raises:\n ValueError: If column names don't match.\n\n Returns:\n None: self is updated.\n \"\"\"\n type_check(other, BaseTable)\n for name in self.columns.keys():\n if name not in other.columns:\n raise ValueError(f\"{name} not in other\")\n for name in other.columns.keys():\n if name not in self.columns:\n raise ValueError(f\"{name} missing from self\")\n\n for name, column in self.columns.items():\n other_col = other.columns.get(name, None)\n column.pages.extend(other_col.pages[:])\n return self\n "},{"location":"reference/core/#tablite.core.Table.__add__","title":"tablite.core.Table.__add__(other) ","text":"Concatenates tables with same column names. 
Like list: table_3 = table_1 + table_2 RAISES DESCRIPTION ValueError If column names don't match. RETURNS DESCRIPTION Table Source code in tablite/base.py def __add__(self, other):\n \"\"\"Concatenates tables with same column names.\n\n Like list: `table_3 = table_1 + table_2`\n\n Args:\n other (Table)\n\n Raises:\n ValueError: If column names don't match.\n\n Returns:\n Table\n \"\"\"\n type_check(other, BaseTable)\n cp = self.copy()\n cp += other\n return cp\n "},{"location":"reference/core/#tablite.core.Table.add_rows","title":"tablite.core.Table.add_rows(*args, **kwargs) ","text":"it's more efficient to add many rows at once. if both args and kwargs, then args are added first, followed by kwargs. supported cases: >>> t = Table()\n>>> t.add_columns('row','A','B','C')\n>>> t.add_rows(1, 1, 2, 3) # (1) individual values as args\n>>> t.add_rows([2, 1, 2, 3]) # (2) list of values as args\n>>> t.add_rows((3, 1, 2, 3)) # (3) tuple of values as args\n>>> t.add_rows(*(4, 1, 2, 3)) # (4) unpacked tuple becomes arg like (1)\n>>> t.add_rows(row=5, A=1, B=2, C=3) # (5) kwargs\n>>> t.add_rows(**{'row': 6, 'A': 1, 'B': 2, 'C': 3}) # (6) dict / json interpreted as kwargs\n>>> t.add_rows((7, 1, 2, 3), (8, 4, 5, 6)) # (7) two (or more) tuples as args\n>>> t.add_rows([9, 1, 2, 3], [10, 4, 5, 6]) # (8) two (or more) lists as args\n>>> t.add_rows(\n {'row': 11, 'A': 1, 'B': 2, 'C': 3},\n {'row': 12, 'A': 4, 'B': 5, 'C': 6}\n ) # (9) two (or more) dicts as args - roughly comma sep'd json.\n>>> t.add_rows( *[\n {'row': 13, 'A': 1, 'B': 2, 'C': 3},\n {'row': 14, 'A': 1, 'B': 2, 'C': 3}\n ]) # (10) list of dicts as args\n>>> t.add_rows(row=[15,16], A=[1,1], B=[2,2], C=[3,3]) # (11) kwargs with lists as values\n Source code in tablite/base.py def add_rows(self, *args, **kwargs):\n \"\"\"it's more efficient to add many rows at once.\n\n if both args and kwargs, then args are added first, followed by kwargs.\n\n supported cases:\n ```\n >>> t = Table()\n >>> t.add_columns('row','A','B','C')\n >>> t.add_rows(1, 1, 2, 3) # (1) individual values as args\n >>> t.add_rows([2, 1, 2, 3]) # (2) list of values as args\n >>> t.add_rows((3, 1, 2, 3)) # (3) tuple of values as args\n >>> t.add_rows(*(4, 1, 2, 3)) # (4) unpacked tuple becomes arg like (1)\n >>> t.add_rows(row=5, A=1, B=2, C=3) # (5) kwargs\n >>> t.add_rows(**{'row': 6, 'A': 1, 'B': 2, 'C': 3}) # (6) dict / json interpreted as kwargs\n >>> t.add_rows((7, 1, 2, 3), (8, 4, 5, 6)) # (7) two (or more) tuples as args\n >>> t.add_rows([9, 1, 2, 3], [10, 4, 5, 6]) # (8) two (or more) lists as args\n >>> t.add_rows(\n {'row': 11, 'A': 1, 'B': 2, 'C': 3},\n {'row': 12, 'A': 4, 'B': 5, 'C': 6}\n ) # (9) two (or more) dicts as args - roughly comma sep'd json.\n >>> t.add_rows( *[\n {'row': 13, 'A': 1, 'B': 2, 'C': 3},\n {'row': 14, 'A': 1, 'B': 2, 'C': 3}\n ]) # (10) list of dicts as args\n >>> t.add_rows(row=[15,16], A=[1,1], B=[2,2], C=[3,3]) # (11) kwargs with lists as values\n ```\n\n \"\"\"\n if not BaseTable._add_row_slow_warning:\n warnings.warn(\n \"add_rows is slow. Consider using add_columns and then assigning values to the columns directly.\"\n )\n BaseTable._add_row_slow_warning = True\n\n if args:\n if not all(isinstance(i, (list, tuple, dict)) for i in args): # 1,4\n args = [args]\n\n if all(isinstance(i, (list, tuple, dict)) for i in args): # 2,3,7,8\n # 1. 
turn the data into columns:\n\n d = {n: [] for n in self.columns}\n for arg in args:\n if len(arg) != len(self.columns):\n raise ValueError(\n f\"len({arg})== {len(arg)}, but there are {len(self.columns)} columns\"\n )\n\n if isinstance(arg, dict):\n for k, v in arg.items(): # 7,8\n d[k].append(v)\n\n elif isinstance(arg, (list, tuple)): # 2,3\n for n, v in zip(self.columns, arg):\n d[n].append(v)\n\n else:\n raise TypeError(f\"{arg}?\")\n # 2. extend the columns\n for n, values in d.items():\n col = self.columns[n]\n col.extend(list_to_np_array(values))\n\n if kwargs:\n if isinstance(kwargs, dict):\n if all(isinstance(v, (list, tuple)) for v in kwargs.values()):\n for k, v in kwargs.items():\n col = self.columns[k]\n col.extend(list_to_np_array(v))\n else:\n for k, v in kwargs.items():\n col = self.columns[k]\n col.extend(np.array([v]))\n else:\n raise ValueError(f\"format not recognised: {kwargs}\")\n\n return\n "},{"location":"reference/core/#tablite.core.Table.add_columns","title":"tablite.core.Table.add_columns(*names) ","text":"Adds column names to table. Source code in tablite/base.py def add_columns(self, *names):\n \"\"\"Adds column names to table.\"\"\"\n for name in names:\n self.columns[name] = Column(self.path)\n "},{"location":"reference/core/#tablite.core.Table.add_column","title":"tablite.core.Table.add_column(name, data=None) ","text":"verbose alias for table[name] = data, that checks if name already exists PARAMETER DESCRIPTION name column name TYPE: str data values. Defaults to None. TYPE: list,tuple) DEFAULT: None RAISES DESCRIPTION TypeError name isn't string ValueError name already exists Source code in tablite/base.py def add_column(self, name, data=None):\n \"\"\"verbose alias for table[name] = data, that checks if name already exists\n\n Args:\n name (str): column name\n data ((list,tuple), optional): values. Defaults to None.\n\n Raises:\n TypeError: name isn't string\n ValueError: name already exists\n \"\"\"\n if not isinstance(name, str):\n raise TypeError(\"expected name as string\")\n if name in self.columns:\n raise ValueError(f\"{name} already in {self.columns}\")\n self.__setitem__(name, data)\n "},{"location":"reference/core/#tablite.core.Table.stack","title":"tablite.core.Table.stack(other) ","text":"returns the joint stack of tables with overlapping column names. Example: | Table A| + | Table B| = | Table AB |\n| A| B| C| | A| B| D| | A| B| C| -|\n | A| B| -| D|\n Source code in tablite/base.py def stack(self, other):\n \"\"\"\n returns the joint stack of tables with overlapping column names.\n Example:\n ```\n | Table A| + | Table B| = | Table AB |\n | A| B| C| | A| B| D| | A| B| C| -|\n | A| B| -| D|\n ```\n \"\"\"\n if not isinstance(other, BaseTable):\n raise TypeError(f\"stack only works for Table, not {type(other)}\")\n\n cp = self.copy()\n for name, col2 in other.columns.items():\n if name not in cp.columns:\n cp[name] = [None] * len(self)\n cp[name].pages.extend(col2.pages[:])\n\n for name in self.columns:\n if name not in other.columns:\n if len(cp) > 0:\n cp[name].extend(np.array([None] * len(other)))\n return cp\n "},{"location":"reference/core/#tablite.core.Table.types","title":"tablite.core.Table.types() ","text":"returns nested dict of data types in the form: {column name: {python type class: number of instances }, ... 
} example: >>> t.types()\n{\n 'A': {<class 'str'>: 7},\n 'B': {<class 'int'>: 7}\n}\n Source code in tablite/base.py def types(self):\n \"\"\"\n returns nested dict of data types in the form:\n `{column name: {python type class: number of instances }, ... }`\n\n example:\n ```\n >>> t.types()\n {\n 'A': {<class 'str'>: 7},\n 'B': {<class 'int'>: 7}\n }\n ```\n \"\"\"\n d = {}\n for name, col in self.columns.items():\n assert isinstance(col, Column)\n d[name] = col.types()\n return d\n "},{"location":"reference/core/#tablite.core.Table.display_dict","title":"tablite.core.Table.display_dict(slice_=None, blanks=None, dtype=False) ","text":"helper for creating dict for display. PARAMETER DESCRIPTION slice_ python slice. Defaults to None. TYPE: slice DEFAULT: None blanks fill value for None . Defaults to None. TYPE: optional DEFAULT: None dtype Adds datatype to each column. Defaults to False. TYPE: bool DEFAULT: False RAISES DESCRIPTION TypeError slice_ must be None or slice. RETURNS DESCRIPTION dict from Table. Source code in tablite/base.py def display_dict(self, slice_=None, blanks=None, dtype=False):\n \"\"\"helper for creating dict for display.\n\n Args:\n slice_ (slice, optional): python slice. Defaults to None.\n blanks (optional): fill value for `None`. Defaults to None.\n dtype (bool, optional): Adds datatype to each column. Defaults to False.\n\n Raises:\n TypeError: slice_ must be None or slice.\n\n Returns:\n dict: from Table.\n \"\"\"\n if not self.columns:\n print(\"Empty Table\")\n return\n\n def datatype(col): # PRIVATE\n \"\"\"creates label for column datatype.\"\"\"\n types = col.types()\n if len(types) == 0:\n typ = \"empty\"\n elif len(types) == 1:\n dt, _ = types.popitem()\n typ = dt.__name__\n else:\n typ = \"mixed\"\n return typ\n\n row_count_tags = [\"#\", \"~\", \"*\"]\n cols = set(self.columns)\n for n, tag in product(range(1, 6), row_count_tags):\n if n * tag not in cols:\n tag = n * tag\n break\n\n if not isinstance(slice_, (slice, type(None))):\n raise TypeError(f\"slice_ must be None or slice, not {type(slice_)}\")\n if isinstance(slice_, slice):\n slc = slice_\n if slice_ is None:\n if len(self) <= 20:\n slc = slice(0, 20, 1)\n else:\n slc = None\n\n n = len(self)\n if slc: # either we want slc or we want everything.\n row_no = list(range(*slc.indices(len(self))))\n data = {tag: [f\"{i:,}\".rjust(2) for i in row_no]}\n for name, col in self.columns.items():\n data[name] = list(chain(iter(col), repeat(blanks, times=n - len(col))))[\n slc\n ]\n else:\n data = {}\n j = int(math.ceil(math.log10(n)) / 3) + len(str(n))\n row_no = (\n [f\"{i:,}\".rjust(j) for i in range(7)]\n + [\"...\"]\n + [f\"{i:,}\".rjust(j) for i in range(n - 7, n)]\n )\n data = {tag: row_no}\n\n for name, col in self.columns.items():\n if len(col) == n:\n row = col[:7].tolist() + [\"...\"] + col[-7:].tolist()\n else:\n empty = [blanks] * 7\n head = (col[:7].tolist() + empty)[:7]\n tail = (col[n - 7 :].tolist() + empty)[-7:]\n row = head + [\"...\"] + tail\n data[name] = row\n\n if dtype:\n for name, values in data.items():\n if name in self.columns:\n col = self.columns[name]\n values.insert(0, datatype(col))\n else:\n values.insert(0, \"row\")\n\n return data\n "},{"location":"reference/core/#tablite.core.Table.to_ascii","title":"tablite.core.Table.to_ascii(slice_=None, blanks=None, dtype=False) ","text":"returns ascii view of table as string. PARAMETER DESCRIPTION slice_ slice to determine table snippet. TYPE: slice DEFAULT: None blanks value for whitespace. Defaults to None. 
TYPE: str DEFAULT: None dtype adds subheader with datatype for column. Defaults to False. TYPE: bool DEFAULT: False Source code in tablite/base.py def to_ascii(self, slice_=None, blanks=None, dtype=False):\n \"\"\"returns ascii view of table as string.\n\n Args:\n slice_ (slice, optional): slice to determine table snippet.\n blanks (str, optional): value for whitespace. Defaults to None.\n dtype (bool, optional): adds subheader with datatype for column. Defaults to False.\n \"\"\"\n\n def adjust(v, length): # PRIVATE FUNCTION\n \"\"\"whitespace justifies field values based on datatype\"\"\"\n if v is None:\n return str(blanks).ljust(length)\n elif isinstance(v, str):\n return v.ljust(length)\n else:\n return str(v).rjust(length)\n\n if not self.columns:\n return str(self)\n\n d = {}\n for name, values in self.display_dict(\n slice_=slice_, blanks=blanks, dtype=dtype\n ).items():\n as_text = [str(v) for v in values] + [str(name)]\n width = max(len(i) for i in as_text)\n new_name = name.center(width, \" \")\n if dtype:\n values[0] = values[0].center(width, \" \")\n d[new_name] = [adjust(v, width) for v in values]\n\n rows = dict_to_rows(d)\n s = []\n s.append(\"+\" + \"+\".join([\"=\" * len(n) for n in rows[0]]) + \"+\")\n s.append(\"|\" + \"|\".join(rows[0]) + \"|\") # column names\n start = 1\n if dtype:\n s.append(\"|\" + \"|\".join(rows[1]) + \"|\") # datatypes\n start = 2\n\n s.append(\"+\" + \"+\".join([\"-\" * len(n) for n in rows[0]]) + \"+\")\n for row in rows[start:]:\n s.append(\"|\" + \"|\".join(row) + \"|\")\n s.append(\"+\" + \"+\".join([\"=\" * len(n) for n in rows[0]]) + \"+\")\n\n if len(set(len(c) for c in self.columns.values())) != 1:\n warning = f\"Warning: Columns have different lengths. {blanks} is used as fill value.\"\n s.append(warning)\n\n return \"\\n\".join(s)\n "},{"location":"reference/core/#tablite.core.Table.show","title":"tablite.core.Table.show(slice_=None, blanks=None, dtype=False) ","text":"prints ascii view of table. PARAMETER DESCRIPTION slice_ slice to determine table snippet. TYPE: slice DEFAULT: None blanks value for whitespace. Defaults to None. TYPE: str DEFAULT: None dtype adds subheader with datatype for column. Defaults to False. TYPE: bool DEFAULT: False Source code in tablite/base.py def show(self, slice_=None, blanks=None, dtype=False):\n \"\"\"prints ascii view of table.\n\n Args:\n slice_ (slice, optional): slice to determine table snippet.\n blanks (str, optional): value for whitespace. Defaults to None.\n dtype (bool, optional): adds subheader with datatype for column. Defaults to False.\n \"\"\"\n print(self.to_ascii(slice_=slice_, blanks=blanks, dtype=dtype))\n "},{"location":"reference/core/#tablite.core.Table.to_dict","title":"tablite.core.Table.to_dict(columns=None, slice_=None) ","text":"columns: list of column names. Default is None == all columns. slice_: slice. Default is None == all rows. returns: dict with columns as keys and lists of values. Example: >>> t.show()\n+===+===+===+\n| # | a | b |\n|row|int|int|\n+---+---+---+\n| 0 | 1| 3|\n| 1 | 2| 4|\n+===+===+===+\n>>> t.to_dict()\n{'a':[1,2], 'b':[3,4]}\n Source code in tablite/base.py def to_dict(self, columns=None, slice_=None):\n \"\"\"\n columns: list of column names. Default is None == all columns.\n slice_: slice. 
Default is None == all rows.\n\n returns: dict with columns as keys and lists of values.\n\n Example:\n ```\n >>> t.show()\n +===+===+===+\n | # | a | b |\n |row|int|int|\n +---+---+---+\n | 0 | 1| 3|\n | 1 | 2| 4|\n +===+===+===+\n >>> t.to_dict()\n {'a':[1,2], 'b':[3,4]}\n ```\n\n \"\"\"\n if slice_ is None:\n slice_ = slice(0, len(self))\n assert isinstance(slice_, slice)\n\n if columns is None:\n columns = list(self.columns.keys())\n if not isinstance(columns, list):\n raise TypeError(\"expected columns as list of strings\")\n\n return {name: list(self.columns[name][slice_]) for name in columns}\n "},{"location":"reference/core/#tablite.core.Table.as_json_serializable","title":"tablite.core.Table.as_json_serializable(row_count='row id', start_on=1, columns=None, slice_=None) ","text":"provides a JSON compatible format of the table. PARAMETER DESCRIPTION row_count Label for row counts. Defaults to \"row id\". TYPE: str DEFAULT: 'row id' start_on row counts starts by default on 1. TYPE: int DEFAULT: 1 columns Column names. Defaults to None which returns all columns. TYPE: list of str DEFAULT: None slice_ selector. Defaults to None which returns [:] TYPE: slice DEFAULT: None RETURNS DESCRIPTION JSON serializable dict: All python datatypes have been converted to JSON compliant data. Source code in tablite/base.py def as_json_serializable(\n self, row_count=\"row id\", start_on=1, columns=None, slice_=None\n):\n \"\"\"provides a JSON compatible format of the table.\n\n Args:\n row_count (str, optional): Label for row counts. Defaults to \"row id\".\n start_on (int, optional): row counts starts by default on 1.\n columns (list of str, optional): Column names.\n Defaults to None which returns all columns.\n slice_ (slice, optional): selector. Defaults to None which returns [:]\n\n Returns:\n JSON serializable dict: All python datatypes have been converted to JSON compliant data.\n \"\"\"\n if slice_ is None:\n slice_ = slice(0, len(self))\n\n assert isinstance(slice_, slice)\n new = {\"columns\": {}, \"total_rows\": len(self)}\n if row_count is not None:\n new[\"columns\"][row_count] = [\n i + start_on for i in range(*slice_.indices(len(self)))\n ]\n\n d = self.to_dict(columns, slice_=slice_)\n for k, data in d.items():\n new_k = unique_name(\n k, new[\"columns\"]\n ) # used to avoid overwriting the `row id` key.\n new[\"columns\"][new_k] = [\n DataTypes.to_json(v) for v in data\n ] # deal with non-json datatypes.\n return new\n "},{"location":"reference/core/#tablite.core.Table.index","title":"tablite.core.Table.index(*args) ","text":"param: *args: column names returns multikey index on the columns as d[(key tuple, )] = {index1, index2, ...} Examples: >>> table6 = Table()\n>>> table6['A'] = ['Alice', 'Bob', 'Bob', 'Ben', 'Charlie', 'Ben','Albert']\n>>> table6['B'] = ['Alison', 'Marley', 'Dylan', 'Affleck', 'Hepburn', 'Barnes', 'Einstein']\n >>> table6.index('A') # single key.\n{('Alice',): [0],\n ('Bob',): [1, 2],\n ('Ben',): [3, 5],\n ('Charlie',): [4],\n ('Albert',): [6]})\n >>> table6.index('A', 'B') # multiple keys.\n{('Alice', 'Alison'): [0],\n ('Bob', 'Marley'): [1],\n ('Bob', 'Dylan'): [2],\n ('Ben', 'Affleck'): [3],\n ('Charlie', 'Hepburn'): [4],\n ('Ben', 'Barnes'): [5],\n ('Albert', 'Einstein'): [6]})\n Source code in tablite/base.py def index(self, *args):\n \"\"\"\n param: *args: column names\n returns multikey index on the columns as d[(key tuple, )] = {index1, index2, ...}\n\n Examples:\n ```\n >>> table6 = Table()\n >>> table6['A'] = ['Alice', 'Bob', 'Bob', 'Ben', 'Charlie', 
'Ben','Albert']\n >>> table6['B'] = ['Alison', 'Marley', 'Dylan', 'Affleck', 'Hepburn', 'Barnes', 'Einstein']\n ```\n\n ```\n >>> table6.index('A') # single key.\n {('Alice',): [0],\n ('Bob',): [1, 2],\n ('Ben',): [3, 5],\n ('Charlie',): [4],\n ('Albert',): [6]})\n ```\n\n ```\n >>> table6.index('A', 'B') # multiple keys.\n {('Alice', 'Alison'): [0],\n ('Bob', 'Marley'): [1],\n ('Bob', 'Dylan'): [2],\n ('Ben', 'Affleck'): [3],\n ('Charlie', 'Hepburn'): [4],\n ('Ben', 'Barnes'): [5],\n ('Albert', 'Einstein'): [6]})\n ```\n\n \"\"\"\n idx = defaultdict(list)\n iterators = [iter(self.columns[c]) for c in args]\n for ix, key in enumerate(zip(*iterators)):\n key = tuple(numpy_to_python(k) for k in key)\n idx[key].append(ix)\n return idx\n "},{"location":"reference/core/#tablite.core.Table.unique_index","title":"tablite.core.Table.unique_index(*args, tqdm=_tqdm) ","text":"generates the index of unique rows given a list of column names PARAMETER DESCRIPTION *args columns names TYPE: any DEFAULT: () tqdm Defaults to _tqdm. TYPE: tqdm DEFAULT: tqdm RETURNS DESCRIPTION np.array(int64): indices of unique records. Source code in tablite/base.py def unique_index(self, *args, tqdm=_tqdm):\n \"\"\"generates the index of unique rows given a list of column names\n\n Args:\n *args (any): columns names\n tqdm (tqdm, optional): Defaults to _tqdm.\n\n Returns:\n np.array(int64): indices of unique records.\n \"\"\"\n if not args:\n raise ValueError(\"*args (column names) is required\")\n seen = set()\n unique = set()\n iterators = [iter(self.columns[c]) for c in args]\n for ix, key in tqdm(enumerate(zip(*iterators)), disable=Config.TQDM_DISABLE):\n key_hash = hash(tuple(numpy_to_python(k) for k in key))\n if key_hash in seen:\n continue\n else:\n seen.add(key_hash)\n unique.add(ix)\n return np.array(sorted(unique))\n "},{"location":"reference/core/#tablite.core.Table.from_file","title":"tablite.core.Table.from_file(path, columns=None, first_row_has_headers=True, header_row_index=0, encoding=None, start=0, limit=sys.maxsize, sheet=None, guess_datatypes=True, newline='\\n', text_qualifier=None, delimiter=None, strip_leading_and_tailing_whitespace=True, text_escape_openings='', text_escape_closures='', skip_empty: ValidSkipEmpty = 'NONE', tqdm=_tqdm) -> Table classmethod ","text":" reads path and imports 1 or more tables\n\n REQUIRED\n --------\n path: pathlib.Path or str\n selection of filereader uses path.suffix.\n See `filereaders`.\n\n OPTIONAL\n --------\n columns:\n None: (default) All columns will be imported.\n List: only column names from list will be imported (if present in file)\n e.g. ['A', 'B', 'C', 'D']\n\n datatype is detected using Datatypes.guess(...)\n You can try it out with:\n >> from tablite.datatypes import DataTypes\n >> DataTypes.guess(['001','100'])\n [1,100]\n\n if the format cannot be achieved the read type is kept.\n Excess column names are ignored.\n\n HINT: To get the head of file use:\n >>> from tablite.tools import head\n >>> head = head(path)\n\n first_row_has_headers: boolean\n True: (default) first row is used as column names.\n False: integers are used as column names.\n\n encoding: str. Defaults to None (autodetect using n bytes).\n n is declared in filereader_utils as ENCODING_GUESS_BYTES\n\n start: the first line to be read (default: 0)\n\n limit: the number of lines to be read from start (default sys.maxint ~ 2**63)\n\n OPTIONAL FOR EXCEL AND ODS READERS\n ----------------------------------\n\n sheet: sheet name to import (applicable to excel- and ods-reader only)\n e.g. 
'sheet_1'\n sheet names that are not found are ignored.\n\n OPTIONAL FOR TEXT READERS\n -------------------------\n guess_datatypes: bool\n True: (default) datatypes are guessed using DataTypes.guess(...)\n False: all data is imported as strings.\n\n newline: newline character (applicable to text_reader only)\n str: '\\n' (default) or '\\r\\n'\n\n text_qualifier: character (applicable to text_reader only)\n None: No text qualifier is used.\n str: \" or '\n\n delimiter: character (applicable to text_reader only)\n None: file suffix is used to determine field delimiter:\n .txt: \"|\"\n .csv: \",\",\n .ssv: \";\"\n .tsv: \"\\t\" (tab)\n\n strip_leading_and_tailing_whitespace: bool:\n True: default\n\n text_escape_openings: (applicable to text_reader only)\n None: default\n str: list of characters such as ([{\n\n text_escape_closures: (applicable to text_reader only)\n None: default\n str: list of characters such as }])\n Source code in tablite/core.py @classmethod\ndef from_file(\n cls,\n path,\n columns=None,\n first_row_has_headers=True,\n header_row_index=0,\n encoding=None,\n start=0,\n limit=sys.maxsize,\n sheet=None,\n guess_datatypes=True,\n newline=\"\\n\",\n text_qualifier=None,\n delimiter=None,\n strip_leading_and_tailing_whitespace=True,\n text_escape_openings=\"\",\n text_escape_closures=\"\",\n skip_empty: ValidSkipEmpty=\"NONE\",\n tqdm=_tqdm,\n) -> \"Table\":\n \"\"\"\n reads path and imports 1 or more tables\n\n REQUIRED\n --------\n path: pathlib.Path or str\n selection of filereader uses path.suffix.\n See `filereaders`.\n\n OPTIONAL\n --------\n columns:\n None: (default) All columns will be imported.\n List: only column names from list will be imported (if present in file)\n e.g. ['A', 'B', 'C', 'D']\n\n datatype is detected using Datatypes.guess(...)\n You can try it out with:\n >> from tablite.datatypes import DataTypes\n >> DataTypes.guess(['001','100'])\n [1,100]\n\n if the format cannot be achieved the read type is kept.\n Excess column names are ignored.\n\n HINT: To get the head of file use:\n >>> from tablite.tools import head\n >>> head = head(path)\n\n first_row_has_headers: boolean\n True: (default) first row is used as column names.\n False: integers are used as column names.\n\n encoding: str. Defaults to None (autodetect using n bytes).\n n is declared in filereader_utils as ENCODING_GUESS_BYTES\n\n start: the first line to be read (default: 0)\n\n limit: the number of lines to be read from start (default sys.maxint ~ 2**63)\n\n OPTIONAL FOR EXCEL AND ODS READERS\n ----------------------------------\n\n sheet: sheet name to import (applicable to excel- and ods-reader only)\n e.g. 
'sheet_1'\n sheet names that are not found are ignored.\n\n OPTIONAL FOR TEXT READERS\n -------------------------\n guess_datatypes: bool\n True: (default) datatypes are guessed using DataTypes.guess(...)\n False: all data is imported as strings.\n\n newline: newline character (applicable to text_reader only)\n str: '\\n' (default) or '\\r\\n'\n\n text_qualifier: character (applicable to text_reader only)\n None: No text qualifier is used.\n str: \" or '\n\n delimiter: character (applicable to text_reader only)\n None: file suffix is used to determine field delimiter:\n .txt: \"|\"\n .csv: \",\",\n .ssv: \";\"\n .tsv: \"\\t\" (tab)\n\n strip_leading_and_tailing_whitespace: bool:\n True: default\n\n text_escape_openings: (applicable to text_reader only)\n None: default\n str: list of characters such as ([{\n\n text_escape_closures: (applicable to text_reader only)\n None: default\n str: list of characters such as }])\n\n \"\"\"\n if isinstance(path, str):\n path = Path(path)\n type_check(path, Path)\n\n if not path.exists():\n raise FileNotFoundError(f\"file not found: {path}\")\n\n if not isinstance(start, int) or not 0 <= start <= sys.maxsize:\n raise ValueError(f\"start {start} not in range(0,{sys.maxsize})\")\n\n if not isinstance(limit, int) or not 0 < limit <= sys.maxsize:\n raise ValueError(f\"limit {limit} not in range(0,{sys.maxsize})\")\n\n if not isinstance(first_row_has_headers, bool):\n raise TypeError(\"first_row_has_headers is not bool\")\n\n import_as = path.suffix\n if import_as.startswith(\".\"):\n import_as = import_as[1:]\n\n reader = import_utils.file_readers.get(import_as, None)\n if reader is None:\n raise ValueError(f\"{import_as} is not in supported format: {import_utils.valid_readers}\")\n\n additional_configs = {\"tqdm\": tqdm}\n if reader == import_utils.text_reader:\n # here we inject tqdm, if tqdm is not provided, use generic iterator\n # fmt:off\n config = (path, columns, first_row_has_headers, header_row_index, encoding, start, limit, newline,\n guess_datatypes, text_qualifier, strip_leading_and_tailing_whitespace, skip_empty,\n delimiter, text_escape_openings, text_escape_closures)\n # fmt:on\n\n elif reader == import_utils.from_html:\n config = (path,)\n elif reader == import_utils.from_hdf5:\n config = (path,)\n\n elif reader == import_utils.excel_reader:\n # config = path, first_row_has_headers, sheet, columns, start, limit\n config = (\n path,\n first_row_has_headers,\n header_row_index,\n sheet,\n columns,\n skip_empty,\n start,\n limit,\n ) # if file length changes - re-import.\n\n if reader == import_utils.ods_reader:\n # path, first_row_has_headers=True, sheet=None, columns=None, start=0, limit=sys.maxsize,\n config = (\n str(path),\n first_row_has_headers,\n header_row_index,\n sheet,\n columns,\n skip_empty,\n start,\n limit,\n ) # if file length changes - re-import.\n\n # At this point the import config seems valid.\n # Now we check if the file already has been imported.\n\n # publish the settings\n return reader(cls, *config, **additional_configs)\n "},{"location":"reference/core/#tablite.core.Table.from_pandas","title":"tablite.core.Table.from_pandas(df) classmethod ","text":"Creates Table using pd.to_dict('list') similar to: >>> import pandas as pd\n>>> df = pd.DataFrame({'a':[1,2,3], 'b':[4,5,6]})\n>>> df\n a b\n 0 1 4\n 1 2 5\n 2 3 6\n>>> df.to_dict('list')\n{'a': [1, 2, 3], 'b': [4, 5, 6]}\n>>> t = Table(columns=df.to_dict('list'))\n>>> t.show()\n +===+===+===+\n | # | a | b |\n |row|int|int|\n +---+---+---+\n | 0 | 1| 4|\n | 1 | 2| 5|\n | 2 
| 3| 6|\n +===+===+===+\n Source code in tablite/core.py @classmethod\ndef from_pandas(cls, df):\n \"\"\"\n Creates Table using pd.to_dict('list')\n\n similar to:\n ```\n >>> import pandas as pd\n >>> df = pd.DataFrame({'a':[1,2,3], 'b':[4,5,6]})\n >>> df\n a b\n 0 1 4\n 1 2 5\n 2 3 6\n >>> df.to_dict('list')\n {'a': [1, 2, 3], 'b': [4, 5, 6]}\n >>> t = Table.from_dict(df.to_dict('list'))\n >>> t.show()\n +===+===+===+\n | # | a | b |\n |row|int|int|\n +---+---+---+\n | 0 | 1| 4|\n | 1 | 2| 5|\n | 2 | 3| 6|\n +===+===+===+\n ```\n \"\"\"\n return import_utils.from_pandas(cls, df)\n "},{"location":"reference/core/#tablite.core.Table.from_hdf5","title":"tablite.core.Table.from_hdf5(path) classmethod ","text":"imports an exported hdf5 table. Source code in tablite/core.py @classmethod\ndef from_hdf5(cls, path):\n \"\"\"\n imports an exported hdf5 table.\n \"\"\"\n return import_utils.from_hdf5(cls, path)\n "},{"location":"reference/core/#tablite.core.Table.from_json","title":"tablite.core.Table.from_json(jsn) classmethod ","text":"Imports table exported using .to_json Source code in tablite/core.py @classmethod\ndef from_json(cls, jsn):\n \"\"\"\n Imports table exported using .to_json\n \"\"\"\n return import_utils.from_json(cls, jsn)\n "},{"location":"reference/core/#tablite.core.Table.to_hdf5","title":"tablite.core.Table.to_hdf5(path) ","text":"creates a copy of the table as hdf5 Source code in tablite/core.py def to_hdf5(self, path):\n \"\"\"\n creates a copy of the table as hdf5\n \"\"\"\n export_utils.to_hdf5(self, path)\n "},{"location":"reference/core/#tablite.core.Table.to_pandas","title":"tablite.core.Table.to_pandas() ","text":"returns pandas.DataFrame Source code in tablite/core.py def to_pandas(self):\n \"\"\"\n returns pandas.DataFrame\n \"\"\"\n return export_utils.to_pandas(self)\n "},{"location":"reference/core/#tablite.core.Table.to_sql","title":"tablite.core.Table.to_sql(name) ","text":"generates ANSI-92 compliant SQL. 
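Before the source listing below, a minimal sketch of `to_sql` (hedged: the exact statement layout produced by `export_utils.to_sql` is not pinned down by this reference; the table data is illustrative):

```python
from tablite import Table

t = Table()
t["a"] = [1, 2, 3]
t["b"] = [4, 5, 6]

sql = t.to_sql(name="my_table")  # returns ANSI-92 compliant SQL as a str
# expect roughly a CREATE TABLE statement plus INSERTs; inspect before use:
print(sql)
```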
Source code in tablite/core.py def to_sql(self, name):\n \"\"\"\n generates ANSI-92 compliant SQL.\n \"\"\"\n return export_utils.to_sql(self, name) # remove after update to test suite.\n "},{"location":"reference/core/#tablite.core.Table.to_json","title":"tablite.core.Table.to_json() ","text":"returns JSON Source code in tablite/core.py def to_json(self):\n \"\"\"\n returns JSON\n \"\"\"\n return export_utils.to_json(self)\n "},{"location":"reference/core/#tablite.core.Table.to_xlsx","title":"tablite.core.Table.to_xlsx(path) ","text":"exports table to path Source code in tablite/core.py def to_xlsx(self, path):\n \"\"\"\n exports table to path\n \"\"\"\n export_utils.path_suffix_check(path, \".xlsx\")\n export_utils.excel_writer(self, path)\n "},{"location":"reference/core/#tablite.core.Table.to_ods","title":"tablite.core.Table.to_ods(path) ","text":"exports table to path Source code in tablite/core.py def to_ods(self, path):\n \"\"\"\n exports table to path\n \"\"\"\n export_utils.path_suffix_check(path, \".ods\")\n export_utils.excel_writer(self, path)\n "},{"location":"reference/core/#tablite.core.Table.to_csv","title":"tablite.core.Table.to_csv(path) ","text":"exports table to path Source code in tablite/core.py def to_csv(self, path):\n \"\"\"\n exports table to path\n \"\"\"\n export_utils.path_suffix_check(path, \".csv\")\n export_utils.text_writer(self, path)\n "},{"location":"reference/core/#tablite.core.Table.to_tsv","title":"tablite.core.Table.to_tsv(path) ","text":"exports table to path Source code in tablite/core.py def to_tsv(self, path):\n \"\"\"\n exports table to path\n \"\"\"\n export_utils.path_suffix_check(path, \".tsv\")\n export_utils.text_writer(self, path)\n "},{"location":"reference/core/#tablite.core.Table.to_text","title":"tablite.core.Table.to_text(path) ","text":"exports table to path Source code in tablite/core.py def to_text(self, path):\n \"\"\"\n exports table to path\n \"\"\"\n export_utils.path_suffix_check(path, \".txt\")\n export_utils.text_writer(self, path)\n "},{"location":"reference/core/#tablite.core.Table.to_html","title":"tablite.core.Table.to_html(path) ","text":"exports table to path Source code in tablite/core.py def to_html(self, path):\n \"\"\"\n exports table to path\n \"\"\"\n export_utils.path_suffix_check(path, \".html\")\n export_utils.to_html(self, path)\n "},{"location":"reference/core/#tablite.core.Table.expression","title":"tablite.core.Table.expression(expression) ","text":"filters based on an expression, such as: \"all((A==B, C!=4, 200<D))\"\n which is interpreted using python's compiler to: def _f(A,B,C,D):\n return all((A==B, C!=4, 200<D))\n Source code in tablite/core.py def expression(self, expression):\n \"\"\"\n filters based on an expression, such as:\n\n \"all((A==B, C!=4, 200<D))\"\n\n which is interpreted using python's compiler to:\n\n def _f(A,B,C,D):\n return all((A==B, C!=4, 200<D))\n \"\"\"\n return redux._filter_using_expression(self, expression)\n "},{"location":"reference/core/#tablite.core.Table.filter","title":"tablite.core.Table.filter(expressions, filter_type='all', tqdm=_tqdm) ","text":"enables filtering across columns for multiple criteria. 
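A hedged sketch of the list-of-dicts form; the accepted dictionary keys are specified in the parameter notes that follow, and the return shape is an assumption (recent tablite versions return a pair of tables, rows that passed and rows that failed):

```python
from tablite import Table

t = Table()
t["A"] = [1, 2, 3, 4]
t["B"] = [1, 0, 3, 0]

# keep rows where A == B; the dict keys are explained below.
res = t.filter(
    [{"column1": "A", "criteria": "==", "column2": "B"}],
    filter_type="all",
)
```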
expressions: str: Expression that can be compiled and executed row by row.\n example: \"all((A==B and C!=4 and 200<D))\"\n\nlist of dicts: (example):\n\n L = [\n {'column1':'A', 'criteria': \"==\", 'column2': 'B'},\n {'column1':'C', 'criteria': \"!=\", \"value2\": '4'},\n {'value1': 200, 'criteria': \"<\", 'column2': 'D' }\n ]\n\naccepted dictionary keys: 'column1', 'column2', 'criteria', 'value1', 'value2'\n filter_type: 'all' or 'any' Source code in tablite/core.py def filter(self, expressions, filter_type=\"all\", tqdm=_tqdm):\n \"\"\"\n enables filtering across columns for multiple criteria.\n\n expressions:\n\n str: Expression that can be compiled and executed row by row.\n example: \"all((A==B and C!=4 and 200<D))\"\n\n list of dicts: (example):\n\n L = [\n {'column1':'A', 'criteria': \"==\", 'column2': 'B'},\n {'column1':'C', 'criteria': \"!=\", \"value2\": '4'},\n {'value1': 200, 'criteria': \"<\", 'column2': 'D' }\n ]\n\n accepted dictionary keys: 'column1', 'column2', 'criteria', 'value1', 'value2'\n\n filter_type: 'all' or 'any'\n \"\"\"\n return redux.filter(self, expressions, filter_type, tqdm)\n "},{"location":"reference/core/#tablite.core.Table.sort_index","title":"tablite.core.Table.sort_index(sort_mode='excel', tqdm=_tqdm, pbar=None, **kwargs) ","text":"helper for methods sort and is_sorted param: sort_mode: str: \"alphanumeric\", \"unix\", or, \"excel\" (default) param: **kwargs: sort criteria. See Table.sort() Source code in tablite/core.py def sort_index(self, sort_mode=\"excel\", tqdm=_tqdm, pbar=None, **kwargs):\n \"\"\"\n helper for methods `sort` and `is_sorted`\n\n param: sort_mode: str: \"alphanumeric\", \"unix\", or, \"excel\" (default)\n param: **kwargs: sort criteria. See Table.sort()\n \"\"\"\n return sortation.sort_index(self, sort_mode, tqdm=tqdm, pbar=pbar, **kwargs)\n "},{"location":"reference/core/#tablite.core.Table.reindex","title":"tablite.core.Table.reindex(index) ","text":"index: list of integers that declare sort order. Examples: Table: ['a','b','c','d','e','f','g','h']\nindex: [0,2,4,6]\nresult: ['a','c','e','g']\n\nTable: ['a','b','c','d','e','f','g','h']\nindex: [0,2,4,6,1,3,5,7]\nresult: ['a','c','e','g','b','d','f','h']\n Source code in tablite/core.py def reindex(self, index):\n \"\"\"\n index: list of integers that declare sort order.\n\n Examples:\n\n Table: ['a','b','c','d','e','f','g','h']\n index: [0,2,4,6]\n result: ['a','c','e','g']\n\n Table: ['a','b','c','d','e','f','g','h']\n index: [0,2,4,6,1,3,5,7]\n result: ['a','c','e','g','b','d','f','h']\n\n \"\"\"\n if isinstance(index, list):\n index = np.array(index)\n return _reindex.reindex(self, index)\n "},{"location":"reference/core/#tablite.core.Table.drop_duplicates","title":"tablite.core.Table.drop_duplicates(*args) ","text":"removes duplicate rows based on column names args: (optional) column_names if no args, all columns are used. Source code in tablite/core.py def drop_duplicates(self, *args):\n \"\"\"\n removes duplicate rows based on column names\n\n args: (optional) column_names\n if no args, all columns are used.\n \"\"\"\n if not args:\n args = self.columns\n index = self.unique_index(*args)\n return self.reindex(index)\n "},{"location":"reference/core/#tablite.core.Table.sort","title":"tablite.core.Table.sort(mapping, sort_mode='excel', tqdm=_tqdm, pbar: _tqdm = None) ","text":"Perform multi-pass sorting with precedence given by the order of column names. 
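A minimal sketch before the parameter table below (data illustrative; `sort` and `sorted` semantics as documented in the entries that follow):

```python
from tablite import Table

t = Table()
t["A"] = [2, 1, 2, 1]
t["B"] = [10, 20, 30, 40]

t.sort(mapping={"A": False})       # ascending by 'A', sorts t in place
s = t.sorted(mapping={"A": True})  # descending copy; t is left unchanged
```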
PARAMETER DESCRIPTION mapping keys as columns, values as boolean for 'reverse' TYPE: dict sort_mode str: \"alphanumeric\", \"unix\", or, \"excel\" DEFAULT: 'excel' RETURNS DESCRIPTION None Table.sort is sorted inplace Examples: Table.sort(mapping={'A':False}) means sort by 'A' in ascending order. Table.sort(mapping={'A':True, 'B':False}) means sort 'A' in descending order, then (2nd priority) sort B in ascending order. Source code in tablite/core.py def sort(self, mapping, sort_mode=\"excel\", tqdm=_tqdm, pbar: _tqdm = None):\n \"\"\"Perform multi-pass sorting with precedence given by the order of column names.\n\n Args:\n mapping (dict): keys as columns,\n values as boolean for 'reverse'\n sort_mode: str: \"alphanumeric\", \"unix\", or, \"excel\"\n\n Returns:\n None: Table.sort is sorted inplace\n\n Examples:\n Table.sort(mapping={'A':False}) means sort by 'A' in ascending order.\n Table.sort(mapping={'A':True, 'B':False}) means sort 'A' in descending order, then (2nd priority)\n sort B in ascending order.\n \"\"\"\n new = sortation.sort(self, mapping, sort_mode, tqdm=tqdm, pbar=pbar)\n self.columns = new.columns\n "},{"location":"reference/core/#tablite.core.Table.sorted","title":"tablite.core.Table.sorted(mapping, sort_mode='excel', tqdm=_tqdm, pbar: _tqdm = None) ","text":"See sort. Sorted returns a new table in contrast to \"sort\", which is in-place. RETURNS DESCRIPTION Table. Source code in tablite/core.py def sorted(self, mapping, sort_mode=\"excel\", tqdm=_tqdm, pbar: _tqdm = None):\n \"\"\"See sort.\n Sorted returns a new table in contrast to \"sort\", which is in-place.\n\n Returns:\n Table.\n \"\"\"\n return sortation.sort(self, mapping, sort_mode, tqdm=tqdm, pbar=pbar)\n "},{"location":"reference/core/#tablite.core.Table.is_sorted","title":"tablite.core.Table.is_sorted(mapping, sort_mode='excel') ","text":"Performs multi-pass sorting check with precedence given by the order of column names. **kwargs: optional: sort criteria. Source code in tablite/core.py def is_sorted(self, mapping, sort_mode=\"excel\"):\n \"\"\"Performs multi-pass sorting check with precedence given by the order of column names.\n **kwargs: optional: sort criteria. 
See Table.sort()\n :return bool\n \"\"\"\n return sortation.is_sorted(self, mapping, sort_mode)\n "},{"location":"reference/core/#tablite.core.Table.any","title":"tablite.core.Table.any(**kwargs) ","text":"returns Table for rows where ANY kwargs match :param kwargs: dictionary with headers and values / boolean callable Source code in tablite/core.py def any(self, **kwargs):\n \"\"\"\n returns Table for rows where ANY kwargs match\n :param kwargs: dictionary with headers and values / boolean callable\n \"\"\"\n return redux.filter_any(self, **kwargs)\n "},{"location":"reference/core/#tablite.core.Table.all","title":"tablite.core.Table.all(**kwargs) ","text":"returns Table for rows where ALL kwargs match :param kwargs: dictionary with headers and values / boolean callable Examples: t = Table()\nt['a'] = [1,2,3,4]\nt['b'] = [10,20,30,40]\n\ndef f(x):\n return x == 4\ndef g(x):\n return x < 20\n\nt2 = t.any( **{\"a\":f, \"b\":g})\nassert [r for r in t2.rows] == [[1, 10], [4, 40]]\n\nt2 = t.any(a=f,b=g)\nassert [r for r in t2.rows] == [[1, 10], [4, 40]]\n\ndef h(x):\n return x>=2\n\ndef i(x):\n return x<=30\n\nt2 = t.all(a=h,b=i)\nassert [r for r in t2.rows] == [[2,20], [3, 30]]\n Source code in tablite/core.py def all(self, **kwargs):\n \"\"\"\n returns Table for rows where ALL kwargs match\n :param kwargs: dictionary with headers and values / boolean callable\n\n Examples:\n\n t = Table()\n t['a'] = [1,2,3,4]\n t['b'] = [10,20,30,40]\n\n def f(x):\n return x == 4\n def g(x):\n return x < 20\n\n t2 = t.any( **{\"a\":f, \"b\":g})\n assert [r for r in t2.rows] == [[1, 10], [4, 40]]\n\n t2 = t.any(a=f,b=g)\n assert [r for r in t2.rows] == [[1, 10], [4, 40]]\n\n def h(x):\n return x>=2\n\n def i(x):\n return x<=30\n\n t2 = t.all(a=h,b=i)\n assert [r for r in t2.rows] == [[2,20], [3, 30]]\n\n\n \"\"\"\n return redux.filter_all(self, **kwargs)\n "},{"location":"reference/core/#tablite.core.Table.drop","title":"tablite.core.Table.drop(*args) ","text":"removes all rows where args are present. Example: t = Table() t['A'] = [1,2,3,None] t['B'] = [None,2,3,4] t2 = t.drop(None) t2['A'][:], t2['B'][:] ([2,3], [2,3]) Source code in tablite/core.py def drop(self, *args):\n \"\"\"\n removes all rows where args are present.\n\n Example:\n >>> t = Table()\n >>> t['A'] = [1,2,3,None]\n >>> t['B'] = [None,2,3,4]\n >>> t2 = t.drop(None)\n >>> t2['A'][:], t2['B'][:]\n ([2,3], [2,3])\n\n \"\"\"\n if not args:\n raise ValueError(\"What to drop? None? np.nan? \")\n return redux.drop(self, *args)\n "},{"location":"reference/core/#tablite.core.Table.replace","title":"tablite.core.Table.replace(mapping, columns=None, tqdm=_tqdm, pbar=None) ","text":"replaces all mapped keys with values from named columns PARAMETER DESCRIPTION mapping keys are targets for replacement, values are replacements. TYPE: dict columns target columns. 
Defaults to None (all columns) TYPE: list or str DEFAULT: None RAISES DESCRIPTION ValueError description Source code in tablite/core.py def replace(self, mapping, columns=None, tqdm=_tqdm, pbar=None):\n \"\"\"replaces all mapped keys with values from named columns\n\n Args:\n mapping (dict): keys are targets for replacement,\n values are replacements.\n columns (list or str, optional): target columns.\n Defaults to None (all columns)\n\n Raises:\n ValueError: _description_\n \"\"\"\n if columns is None:\n columns = list(self.columns)\n if not isinstance(columns, list) and columns in self.columns:\n columns = [columns]\n type_check(columns, list)\n for n in columns:\n if n not in self.columns:\n raise ValueError(f\"column not found: {n}\")\n\n if pbar is None:\n total = len(columns)\n pbar = tqdm(total=total, desc=\"replace\", disable=Config.TQDM_DISABLE)\n\n for name in columns:\n col = self.columns[name]\n col.replace(mapping)\n pbar.update(1)\n "},{"location":"reference/core/#tablite.core.Table.groupby","title":"tablite.core.Table.groupby(keys, functions, tqdm=_tqdm, pbar=None) ","text":"keys: column names for grouping. functions: [optional] list of column names and group functions (See GroupBy class) returns: table Example: t = Table()\nt.add_column('A', data=[1, 1, 2, 2, 3, 3] * 2)\nt.add_column('B', data=[1, 2, 3, 4, 5, 6] * 2)\nt.add_column('C', data=[6, 5, 4, 3, 2, 1] * 2)\n\nt.show()\n+=====+=====+=====+\n| A | B | C |\n| int | int | int |\n+-----+-----+-----+\n| 1| 1| 6|\n| 1| 2| 5|\n| 2| 3| 4|\n| 2| 4| 3|\n| 3| 5| 2|\n| 3| 6| 1|\n| 1| 1| 6|\n| 1| 2| 5|\n| 2| 3| 4|\n| 2| 4| 3|\n| 3| 5| 2|\n| 3| 6| 1|\n+=====+=====+=====+\n\ng = t.groupby(keys=['A', 'C'], functions=[('B', gb.sum)])\ng.show()\n+===+===+===+======+\n| # | A | C |Sum(B)|\n|row|int|int| int |\n+---+---+---+------+\n|0 | 1| 6| 2|\n|1 | 1| 5| 4|\n|2 | 2| 4| 6|\n|3 | 2| 3| 8|\n|4 | 3| 2| 10|\n|5 | 3| 1| 12|\n+===+===+===+======+\n Cheat sheet: list of unique values >>> g1 = t.groupby(keys=['A'], functions=[])\n>>> g1['A'][:]\n[1,2,3]\n alternatively: t['A'].unique() [1,2,3] list of unique values, grouped by longest combination. >>> g2 = t.groupby(keys=['A', 'B'], functions=[])\n>>> g2['A'][:], g2['B'][:]\n([1,1,2,2,3,3], [1,2,3,4,5,6])\n alternatively: >>> list(zip(*t.index('A', 'B').keys()))\n[(1,1,2,2,3,3) (1,2,3,4,5,6)]\n A key (unique values) and count hereof. 
>>> g3 = t.groupby(keys=['A'], functions=[('A', gb.count)])\n>>> g3['A'][:], g3['Count(A)'][:]\n([1,2,3], [4,4,4])\n alternatively: >>> t['A'].histogram()\n([1,2,3], [4,4,4])\n for more examples see: https://github.com/root-11/tablite/blob/master/tests/test_groupby.py Source code in tablite/core.py def groupby(self, keys, functions, tqdm=_tqdm, pbar=None):\n \"\"\"\n keys: column names for grouping.\n functions: [optional] list of column names and group functions (See GroupBy class)\n returns: table\n\n Example:\n ```\n t = Table()\n t.add_column('A', data=[1, 1, 2, 2, 3, 3] * 2)\n t.add_column('B', data=[1, 2, 3, 4, 5, 6] * 2)\n t.add_column('C', data=[6, 5, 4, 3, 2, 1] * 2)\n\n t.show()\n +=====+=====+=====+\n | A | B | C |\n | int | int | int |\n +-----+-----+-----+\n | 1| 1| 6|\n | 1| 2| 5|\n | 2| 3| 4|\n | 2| 4| 3|\n | 3| 5| 2|\n | 3| 6| 1|\n | 1| 1| 6|\n | 1| 2| 5|\n | 2| 3| 4|\n | 2| 4| 3|\n | 3| 5| 2|\n | 3| 6| 1|\n +=====+=====+=====+\n\n g = t.groupby(keys=['A', 'C'], functions=[('B', gb.sum)])\n g.show()\n +===+===+===+======+\n | # | A | C |Sum(B)|\n |row|int|int| int |\n +---+---+---+------+\n |0 | 1| 6| 2|\n |1 | 1| 5| 4|\n |2 | 2| 4| 6|\n |3 | 2| 3| 8|\n |4 | 3| 2| 10|\n |5 | 3| 1| 12|\n +===+===+===+======+\n ```\n Cheat sheet:\n\n list of unique values\n ```\n >>> g1 = t.groupby(keys=['A'], functions=[])\n >>> g1['A'][:]\n [1,2,3]\n ```\n alternatively:\n >>> t['A'].unique()\n [1,2,3]\n\n list of unique values, grouped by longest combination.\n ```\n >>> g2 = t.groupby(keys=['A', 'B'], functions=[])\n >>> g2['A'][:], g2['B'][:]\n ([1,1,2,2,3,3], [1,2,3,4,5,6])\n ```\n alternatively:\n ```\n >>> list(zip(*t.index('A', 'B').keys()))\n [(1,1,2,2,3,3) (1,2,3,4,5,6)]\n ```\n A key (unique values) and count hereof.\n ```\n >>> g3 = t.groupby(keys=['A'], functions=[('A', gb.count)])\n >>> g3['A'][:], g3['Count(A)'][:]\n ([1,2,3], [4,4,4])\n ```\n alternatively:\n ```\n >>> t['A'].histogram()\n ([1,2,3], [4,4,4])\n ```\n for more examples see:\n https://github.com/root-11/tablite/blob/master/tests/test_groupby.py\n\n \"\"\"\n return _groupby(self, keys, functions, tqdm)\n "},{"location":"reference/core/#tablite.core.Table.pivot","title":"tablite.core.Table.pivot(rows, columns, functions, values_as_rows=True, tqdm=_tqdm, pbar=None) ","text":"param: rows: column names to keep as rows param: columns: column names to keep as columns param: functions: aggregation functions from the Groupby class as example: t.show()\n+=====+=====+=====+\n| A | B | C |\n| int | int | int |\n+-----+-----+-----+\n| 1| 1| 6|\n| 1| 2| 5|\n| 2| 3| 4|\n| 2| 4| 3|\n| 3| 5| 2|\n| 3| 6| 1|\n| 1| 1| 6|\n| 1| 2| 5|\n| 2| 3| 4|\n| 2| 4| 3|\n| 3| 5| 2|\n| 3| 6| 1|\n+=====+=====+=====+\n\nt2 = t.pivot(rows=['C'], columns=['A'], functions=[('B', gb.sum)])\nt2.show()\n+===+===+========+=====+=====+=====+\n| # | C |function|(A=1)|(A=2)|(A=3)|\n|row|int| str |mixed|mixed|mixed|\n+---+---+--------+-----+-----+-----+\n|0 | 6|Sum(B) | 2|None |None |\n|1 | 5|Sum(B) | 4|None |None |\n|2 | 4|Sum(B) |None | 6|None |\n|3 | 3|Sum(B) |None | 8|None |\n|4 | 2|Sum(B) |None |None | 10|\n|5 | 1|Sum(B) |None |None | 12|\n+===+===+========+=====+=====+=====+\n Source code in tablite/core.py def pivot(self, rows, columns, functions, values_as_rows=True, tqdm=_tqdm, pbar=None):\n \"\"\"\n param: rows: column names to keep as rows\n param: columns: column names to keep as columns\n param: functions: aggregation functions from the Groupby class as\n\n example:\n ```\n t.show()\n +=====+=====+=====+\n | A | B | C |\n | int | int | int |\n 
+-----+-----+-----+\n | 1| 1| 6|\n | 1| 2| 5|\n | 2| 3| 4|\n | 2| 4| 3|\n | 3| 5| 2|\n | 3| 6| 1|\n | 1| 1| 6|\n | 1| 2| 5|\n | 2| 3| 4|\n | 2| 4| 3|\n | 3| 5| 2|\n | 3| 6| 1|\n +=====+=====+=====+\n\n t2 = t.pivot(rows=['C'], columns=['A'], functions=[('B', gb.sum)])\n t2.show()\n +===+===+========+=====+=====+=====+\n | # | C |function|(A=1)|(A=2)|(A=3)|\n |row|int| str |mixed|mixed|mixed|\n +---+---+--------+-----+-----+-----+\n |0 | 6|Sum(B) | 2|None |None |\n |1 | 5|Sum(B) | 4|None |None |\n |2 | 4|Sum(B) |None | 6|None |\n |3 | 3|Sum(B) |None | 8|None |\n |4 | 2|Sum(B) |None |None | 10|\n |5 | 1|Sum(B) |None |None | 12|\n +===+===+========+=====+=====+=====+\n ```\n \"\"\"\n return pivots.pivot(self, rows, columns, functions, values_as_rows, tqdm=tqdm, pbar=pbar)\n "},{"location":"reference/core/#tablite.core.Table.merge","title":"tablite.core.Table.merge(left, right, new, criteria) ","text":"takes from LEFT where criteria is True else RIGHT. :param: T: Table :param: criteria: np.array(bool): if True take left column else take right column :param left: (str) column name :param right: (str) column name :param new: (str) new name :returns: T Example: >>> c.show()\n+==+====+====+====+====+\n| #| A | B | C | D |\n+--+----+----+----+----+\n| 0| 1| 10| 1| 11|\n| 1| 2| 20| 2| 12|\n| 2| 3|None| 3| 13|\n| 3|None| 40|None|None|\n| 4| 5| 50|None|None|\n| 5|None|None| 6| 16|\n| 6|None|None| 7| 17|\n+==+====+====+====+====+\n\n>>> c.merge(\"A\", \"C\", new=\"E\", criteria=[v != None for v in c['A']])\n>>> c.show()\n+==+====+====+====+\n| #| B | D | E |\n+--+----+----+----+\n| 0| 10| 11| 1|\n| 1| 20| 12| 2|\n| 2|None| 13| 3|\n| 3| 40|None|None|\n| 4| 50|None| 5|\n| 5|None| 16| 6|\n| 6|None| 17| 7|\n+==+====+====+====+\n Source code in tablite/core.py def merge(self, left, right, new, criteria):\n \"\"\" takes from LEFT where criteria is True else RIGHT.\n :param: T: Table\n :param: criteria: np.array(bool): \n if True take left column\n else take right column\n :param left: (str) column name\n :param right: (str) column name\n :param new: (str) new name\n\n :returns: T\n\n Example:\n ```\n >>> c.show()\n +==+====+====+====+====+\n | #| A | B | C | D |\n +--+----+----+----+----+\n | 0| 1| 10| 1| 11|\n | 1| 2| 20| 2| 12|\n | 2| 3|None| 3| 13|\n | 3|None| 40|None|None|\n | 4| 5| 50|None|None|\n | 5|None|None| 6| 16|\n | 6|None|None| 7| 17|\n +==+====+====+====+====+\n\n >>> c.merge(\"A\", \"C\", new=\"E\", criteria=[v != None for v in c['A']])\n >>> c.show()\n +==+====+====+====+\n | #| B | D | E |\n +--+----+----+----+\n | 0| 10| 11| 1|\n | 1| 20| 12| 2|\n | 2|None| 13| 3|\n | 3| 40|None|None|\n | 4| 50|None| 5|\n | 5|None| 16| 6|\n | 6|None| 17| 7|\n +==+====+====+====+\n ```\n \"\"\"\n return merge.where(self, criteria,left,right,new)\n "},{"location":"reference/core/#tablite.core.Table.column_select","title":"tablite.core.Table.column_select(cols: list[ColumnSelectorDict], tqdm=_tqdm, TaskManager=_TaskManager) ","text":"type-casts columns from a given table to specified type(s) cols list of dicts: (example): cols = [\n {'column':'A', 'type': 'bool'},\n {'column':'B', 'type': 'int', 'allow_empty': True},\n {'column':'B', 'type': 'float', 'allow_empty': False, 'rename': 'C'},\n]\n 'column' : column name of the input table that we want to type-cast 'type' : type that we want to type-cast the specified column to 'allow_empty': should we allow empty values (None, str('')) through (Default: False) 'rename' : new name of the column, if None will keep the original name, in case of duplicates suffix will 
be added (Default: None) supported types: 'bool', 'int', 'float', 'str', 'date', 'time', 'datetime' if any of the columns is rejected, entire row is rejected tqdm: progressbar constructor TaskManager: TaskManager constructor (TABLE, TABLE) DESCRIPTION first table contains the rows that were successfully cast to desired types second table contains rows that failed to cast + rejection reason Source code in tablite/core.py def column_select(self, cols: list[ColumnSelectorDict], tqdm=_tqdm, TaskManager=_TaskManager):\n \"\"\"\n type-casts columns from a given table to specified type(s)\n\n cols:\n list of dicts: (example):\n\n cols = [\n {'column':'A', 'type': 'bool'},\n {'column':'B', 'type': 'int', 'allow_empty': True},\n {'column':'B', 'type': 'float', 'allow_empty': False, 'rename': 'C'},\n ]\n\n 'column' : column name of the input table that we want to type-cast\n 'type' : type that we want to type-cast the specified column to\n 'allow_empty': should we allow empty values (None, str('')) through (Default: False)\n 'rename' : new name of the column, if None will keep the original name, in case of duplicates suffix will be added (Default: None)\n\n supported types: 'bool', 'int', 'float', 'str', 'date', 'time', 'datetime'\n\n if any of the columns is rejected, entire row is rejected\n\n tqdm: progressbar constructor\n TaskManager: TaskManager constructor\n\n returns: (Table, Table)\n first table contains the rows that were successfully cast to desired types\n second table contains rows that failed to cast + rejection reason\n \"\"\"\n return _column_select(self, cols, tqdm, TaskManager)\n "},{"location":"reference/core/#tablite.core.Table.join","title":"tablite.core.Table.join(other, left_keys, right_keys, left_columns=None, right_columns=None, kind='inner', merge_keys=False, tqdm=_tqdm, pbar=None) ","text":"short-cut for all join functions. kind: 'inner', 'left', 'outer', 'cross' Source code in tablite/core.py def join(self, other, left_keys, right_keys, left_columns=None, right_columns=None, kind=\"inner\", merge_keys=False, tqdm=_tqdm, pbar=None):\n \"\"\"\n short-cut for all join functions.\n kind: 'inner', 'left', 'outer', 'cross'\n \"\"\"\n kinds = {\n \"inner\": self.inner_join,\n \"left\": self.left_join,\n \"outer\": self.outer_join,\n \"cross\": self.cross_join,\n }\n if kind not in kinds:\n raise ValueError(f\"join type unknown: {kind}\")\n f = kinds.get(kind, None)\n return f(other, left_keys, right_keys, left_columns, right_columns, merge_keys=merge_keys, tqdm=tqdm, pbar=pbar)\n "},{"location":"reference/core/#tablite.core.Table.left_join","title":"tablite.core.Table.left_join(other, left_keys, right_keys, left_columns=None, right_columns=None, merge_keys=False, tqdm=_tqdm, pbar=None) ","text":":param other: self, other = (left, right) :param left_keys: list of keys for the join :param right_keys: list of keys for the join :param left_columns: list of left columns to retain, if None, all are retained. :param right_columns: list of right columns to retain, if None, all are retained. 
:return: new Table Example: SQL: SELECT number, letter FROM numbers LEFT JOIN letters ON numbers.colour == letters.color\nTablite: left_join = numbers.left_join(\n letters, left_keys=['colour'], right_keys=['color'], left_columns=['number'], right_columns=['letter']\n)\n Source code in tablite/core.py def left_join(self, other, left_keys, right_keys, left_columns=None, right_columns=None, merge_keys=False, tqdm=_tqdm, pbar=None):\n \"\"\"\n :param other: self, other = (left, right)\n :param left_keys: list of keys for the join\n :param right_keys: list of keys for the join\n :param left_columns: list of left columns to retain, if None, all are retained.\n :param right_columns: list of right columns to retain, if None, all are retained.\n :return: new Table\n Example:\n ```\n SQL: SELECT number, letter FROM numbers LEFT JOIN letters ON numbers.colour == letters.color\n Tablite: left_join = numbers.left_join(\n letters, left_keys=['colour'], right_keys=['color'], left_columns=['number'], right_columns=['letter']\n )\n ```\n \"\"\"\n return joins.left_join(self, other, left_keys, right_keys, left_columns, right_columns, merge_keys=merge_keys, tqdm=tqdm, pbar=pbar)\n "},{"location":"reference/core/#tablite.core.Table.inner_join","title":"tablite.core.Table.inner_join(other, left_keys, right_keys, left_columns=None, right_columns=None, merge_keys=False, tqdm=_tqdm, pbar=None) ","text":":param other: self, other = (left, right) :param left_keys: list of keys for the join :param right_keys: list of keys for the join :param left_columns: list of left columns to retain, if None, all are retained. :param right_columns: list of right columns to retain, if None, all are retained. :return: new Table Example: SQL: SELECT number, letter FROM numbers JOIN letters ON numbers.colour == letters.color\nTablite: inner_join = numbers.inner_join(\n letters, left_keys=['colour'], right_keys=['color'], left_columns=['number'], right_columns=['letter']\n )\n Source code in tablite/core.py def inner_join(self, other, left_keys, right_keys, left_columns=None, right_columns=None, merge_keys=False, tqdm=_tqdm, pbar=None):\n \"\"\"\n :param other: self, other = (left, right)\n :param left_keys: list of keys for the join\n :param right_keys: list of keys for the join\n :param left_columns: list of left columns to retain, if None, all are retained.\n :param right_columns: list of right columns to retain, if None, all are retained.\n :return: new Table\n Example:\n ```\n SQL: SELECT number, letter FROM numbers JOIN letters ON numbers.colour == letters.color\n Tablite: inner_join = numbers.inner_join(\n letters, left_keys=['colour'], right_keys=['color'], left_columns=['number'], right_columns=['letter']\n )\n ```\n \"\"\"\n return joins.inner_join(self, other, left_keys, right_keys, left_columns, right_columns, merge_keys=merge_keys, tqdm=tqdm, pbar=pbar)\n "},{"location":"reference/core/#tablite.core.Table.outer_join","title":"tablite.core.Table.outer_join(other, left_keys, right_keys, left_columns=None, right_columns=None, merge_keys=False, tqdm=_tqdm, pbar=None) ","text":":param other: self, other = (left, right) :param left_keys: list of keys for the join :param right_keys: list of keys for the join :param left_columns: list of left columns to retain, if None, all are retained. :param right_columns: list of right columns to retain, if None, all are retained. 
:return: new Table Example: SQL: SELECT number, letter FROM numbers OUTER JOIN letters ON numbers.colour == letters.color\nTablite: outer_join = numbers.outer_join(\n letters, left_keys=['colour'], right_keys=['color'], left_columns=['number'], right_columns=['letter']\n )\n Source code in tablite/core.py def outer_join(self, other, left_keys, right_keys, left_columns=None, right_columns=None, merge_keys=False, tqdm=_tqdm, pbar=None):\n \"\"\"\n :param other: self, other = (left, right)\n :param left_keys: list of keys for the join\n :param right_keys: list of keys for the join\n :param left_columns: list of left columns to retain, if None, all are retained.\n :param right_columns: list of right columns to retain, if None, all are retained.\n :return: new Table\n Example:\n ```\n SQL: SELECT number, letter FROM numbers OUTER JOIN letters ON numbers.colour == letters.color\n Tablite: outer_join = numbers.outer_join(\n letters, left_keys=['colour'], right_keys=['color'], left_columns=['number'], right_columns=['letter']\n )\n ```\n \"\"\"\n return joins.outer_join(self, other, left_keys, right_keys, left_columns, right_columns, merge_keys=merge_keys, tqdm=tqdm, pbar=pbar)\n "},{"location":"reference/core/#tablite.core.Table.cross_join","title":"tablite.core.Table.cross_join(other, left_keys, right_keys, left_columns=None, right_columns=None, merge_keys=False, tqdm=_tqdm, pbar=None) ","text":"CROSS JOIN returns the Cartesian product of rows from tables in the join. In other words, it will produce rows which combine each row from the first table with each row from the second table Source code in tablite/core.py def cross_join(self, other, left_keys, right_keys, left_columns=None, right_columns=None, merge_keys=False, tqdm=_tqdm, pbar=None):\n \"\"\"\n CROSS JOIN returns the Cartesian product of rows from tables in the join.\n In other words, it will produce rows which combine each row from the first table\n with each row from the second table\n \"\"\"\n return joins.cross_join(self, other, left_keys, right_keys, left_columns, right_columns, merge_keys=merge_keys, tqdm=tqdm, pbar=pbar)\n "},{"location":"reference/core/#tablite.core.Table.lookup","title":"tablite.core.Table.lookup(other, *criteria, all=True, tqdm=_tqdm) ","text":"function for looking up values in other according to criteria in ascending order. :param: other: Table sorted in ascending search order. :param: criteria: Each criteria must be a tuple with value comparisons in the form: (LEFT, OPERATOR, RIGHT) :param: all: boolean: True=ALL, False=Any OPERATOR must be a callable that returns a boolean LEFT must be a value that the OPERATOR can compare. RIGHT must be a value that the OPERATOR can compare. 
Examples: ('column A', \"==\", 'column B') # comparison of two columns\n('Date', \"<\", DataTypes.date(24,12) ) # value from column 'Date' is before 24/12.\nf = lambda L,R: all( ord(L) < ord(R) ) # uses custom function.\n('text 1', f, 'text 2') value from column 'text 1' is compared with value from column 'text 2'\n Source code in tablite/core.py def lookup(self, other, *criteria, all=True, tqdm=_tqdm):\n \"\"\"function for looking up values in `other` according to criteria in ascending order.\n :param: other: Table sorted in ascending search order.\n :param: criteria: Each criteria must be a tuple with value comparisons in the form:\n (LEFT, OPERATOR, RIGHT)\n :param: all: boolean: True=ALL, False=Any\n\n OPERATOR must be a callable that returns a boolean\n LEFT must be a value that the OPERATOR can compare.\n RIGHT must be a value that the OPERATOR can compare.\n\n Examples:\n ```\n ('column A', \"==\", 'column B') # comparison of two columns\n ('Date', \"<\", DataTypes.date(24,12) ) # value from column 'Date' is before 24/12.\n f = lambda L,R: all( ord(L) < ord(R) ) # uses custom function.\n ('text 1', f, 'text 2') value from column 'text 1' is compared with value from column 'text 2'\n ```\n \"\"\"\n return lookup.lookup(self, other, *criteria, all=all, tqdm=tqdm)\n "},{"location":"reference/core/#tablite.core.Table.match","title":"tablite.core.Table.match(other, *criteria, keep_left=None, keep_right=None) ","text":"performs inner join where T matches other and removes rows that do not match. :param: T: Table :param: other: Table :param: criteria: Each criteria must be a tuple with value comparisons in the form: (LEFT, OPERATOR, RIGHT), where operator must be \"==\"\n\nExample:\n ('column A', \"==\", 'column B')\n\nThis syntax follows the lookup syntax. See Lookup for details.\n :param: keep_left: list of columns to keep. :param: keep_right: list of right columns to keep. Source code in tablite/core.py def match(self, other, *criteria, keep_left=None, keep_right=None):\n \"\"\"\n performs inner join where `T` matches `other` and removes rows that do not match.\n\n :param: T: Table\n :param: other: Table\n :param: criteria: Each criteria must be a tuple with value comparisons in the form:\n\n (LEFT, OPERATOR, RIGHT), where operator must be \"==\"\n\n Example:\n ('column A', \"==\", 'column B')\n\n This syntax follows the lookup syntax. See Lookup for details.\n\n :param: keep_left: list of columns to keep.\n :param: keep_right: list of right columns to keep.\n \"\"\"\n return match.match(self, other, *criteria, keep_left=keep_left, keep_right=keep_right)\n "},{"location":"reference/core/#tablite.core.Table.replace_missing_values","title":"tablite.core.Table.replace_missing_values(*args, **kwargs) ","text":"Source code in tablite/core.py def replace_missing_values(self, *args, **kwargs):\n raise AttributeError(\"See imputation\")\n "},{"location":"reference/core/#tablite.core.Table.imputation","title":"tablite.core.Table.imputation(targets, missing=None, method='carry forward', sources=None, tqdm=_tqdm) ","text":"In statistics, imputation is the process of replacing missing data with substituted values. See more: https://en.wikipedia.org/wiki/Imputation_(statistics) PARAMETER DESCRIPTION table source table. TYPE: Table targets column names to find and replace missing values TYPE: str or list of strings missing values to be replaced. TYPE: None or iterable DEFAULT: None method method to be used for replacement. 
Options: 'carry forward': takes the previous value, and carries forward into fields where values are missing. +: quick. Realistic on time series. -: Can produce strange outliers. 'mean': calculates the column mean (exclude missing ) and copies the mean in as replacement. +: quick -: doesn't work on text. Causes data set to drift towards the mean. 'mode': calculates the column mode (exclude missing ) and copies the mode in as replacement. +: quick -: most frequent value becomes over-represented in the sample 'nearest neighbour': calculates normalised distance between items in source columns, selects nearest neighbour and copies value as replacement. +: works for any datatype. -: computationally intensive (e.g. slow) TYPE: str DEFAULT: 'carry forward' sources NEAREST NEIGHBOUR ONLY column names to be used during imputation. if None or empty, all columns will be used. TYPE: list of strings DEFAULT: None RETURNS DESCRIPTION table table with replaced values. Source code in tablite/core.py def imputation(self, targets, missing=None, method=\"carry forward\", sources=None, tqdm=_tqdm):\n \"\"\"\n In statistics, imputation is the process of replacing missing data with substituted values.\n\n See more: https://en.wikipedia.org/wiki/Imputation_(statistics)\n\n Args:\n table (Table): source table.\n\n targets (str or list of strings): column names to find and\n replace missing values\n\n missing (None or iterable): values to be replaced.\n\n method (str): method to be used for replacement. Options:\n\n 'carry forward':\n takes the previous value, and carries forward into fields\n where values are missing.\n +: quick. Realistic on time series.\n -: Can produce strange outliers.\n\n 'mean':\n calculates the column mean (exclude `missing`) and copies\n the mean in as replacement.\n +: quick\n -: doesn't work on text. Causes data set to drift towards the mean.\n\n 'mode':\n calculates the column mode (exclude `missing`) and copies\n the mode in as replacement.\n +: quick\n -: most frequent value becomes over-represented in the sample\n\n 'nearest neighbour':\n calculates normalised distance between items in source columns\n selects nearest neighbour and copies value as replacement.\n +: works for any datatype.\n -: computationally intensive (e.g. slow)\n\n sources (list of strings): NEAREST NEIGHBOUR ONLY\n column names to be used during imputation.\n if None or empty, all columns will be used.\n\n Returns:\n table: table with replaced values.\n \"\"\"\n return imputation.imputation(self, targets, missing, method, sources, tqdm=tqdm)\n "},{"location":"reference/core/#tablite.core.Table.transpose","title":"tablite.core.Table.transpose(tqdm=_tqdm) ","text":"Source code in tablite/core.py def transpose(self, tqdm=_tqdm):\n return pivots.transpose(self, tqdm)\n "},{"location":"reference/core/#tablite.core.Table.pivot_transpose","title":"tablite.core.Table.pivot_transpose(columns, keep=None, column_name='transpose', value_name='value', tqdm=_tqdm) ","text":"Transpose a selection of columns to rows. PARAMETER DESCRIPTION columns column names to transpose TYPE: list of column names keep column names to keep (repeat) TYPE: list of column names DEFAULT: None RETURNS DESCRIPTION Table with columns transposed to rows Example keep columns 1, 2 and 3 and transpose the remaining columns, except sum . Input: | col1 | col2 | col3 | sun | mon | tue | ... | sat | sum |\n|------|------|------|-----|-----|-----|-----|-----|------|\n| 1234 | 2345 | 3456 | 456 | 567 | | ... | | 1023 |\n| 1244 | 2445 | 4456 | | 7 | | ... 
| | 7 |\n| ... | | | | | | | | |\n\nt.transpose(keep=[col1, col2, col3], transpose=[sun,mon,tue,wed,thu,fri,sat])`\n\nOutput:\n\n|col1| col2| col3| transpose| value|\n|----|-----|-----|----------|------|\n|1234| 2345| 3456| sun | 456|\n|1234| 2345| 3456| mon | 567|\n|1244| 2445| 4456| mon | 7|\n Source code in tablite/core.py def pivot_transpose(self, columns, keep=None, column_name=\"transpose\", value_name=\"value\", tqdm=_tqdm):\n \"\"\"Transpose a selection of columns to rows.\n\n Args:\n columns (list of column names): column names to transpose\n keep (list of column names): column names to keep (repeat)\n\n Returns:\n Table: with columns transposed to rows\n\n Example:\n keep columns 1, 2 and 3 and transpose the remaining columns, except `sum`.\n\n Input:\n ```\n | col1 | col2 | col3 | sun | mon | tue | ... | sat | sum |\n |------|------|------|-----|-----|-----|-----|-----|------|\n | 1234 | 2345 | 3456 | 456 | 567 | | ... | | 1023 |\n | 1244 | 2445 | 4456 | | 7 | | ... | | 7 |\n | ... | | | | | | | | |\n\n t.transpose(keep=[col1, col2, col3], transpose=[sun,mon,tue,wed,thu,fri,sat])`\n\n Output:\n\n |col1| col2| col3| transpose| value|\n |----|-----|-----|----------|------|\n |1234| 2345| 3456| sun | 456|\n |1234| 2345| 3456| mon | 567|\n |1244| 2445| 4456| mon | 7|\n ```\n \"\"\"\n return pivots.pivot_transpose(self, columns, keep, column_name, value_name, tqdm=tqdm)\n "},{"location":"reference/core/#tablite.core.Table.diff","title":"tablite.core.Table.diff(other, columns=None) ","text":"compares table self with table other PARAMETER DESCRIPTION self Table TYPE: Table other Table TYPE: Table columns list of column names to include in comparison. 
Defaults to None.\n\n Returns:\n Table: diff of self and other with diff in columns 1st and 2nd.\n \"\"\"\n return diff.diff(self, other, columns)\n "},{"location":"reference/core/#tablite.core-functions","title":"Functions","text":""},{"location":"reference/core/#tablite.core-modules","title":"Modules","text":""},{"location":"reference/datasets/","title":"Datasets","text":""},{"location":"reference/datasets/#tablite.datasets","title":"tablite.datasets ","text":""},{"location":"reference/datasets/#tablite.datasets-classes","title":"Classes","text":""},{"location":"reference/datasets/#tablite.datasets-functions","title":"Functions","text":""},{"location":"reference/datasets/#tablite.datasets.synthetic_order_data","title":"tablite.datasets.synthetic_order_data(rows=100000) ","text":"Creates a synthetic dataset for testing that looks like this: (depending on number of rows) +=========+=======+=============+===================+=====+===+=====+====+===+=====+=====+===================+==================+\n| ~ | # | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 |\n| row | int | int | datetime | int |int| int |str |str|mixed|mixed| float | float |\n+---------+-------+-------------+-------------------+-----+---+-----+----+---+-----+-----+-------------------+------------------+\n|0 | 1|1478158906743|2021-10-27 00:00:00|50764| 1|29990|C4-5|APP|21\u00b0 |None | 2.0434376837650046|1.3371665497020444|\n|1 | 2|2271295805011|2021-09-13 00:00:00|50141| 0|10212|C4-5|TAE|None |None | 1.010318612835485| 20.94821610676901|\n|2 | 3|1598726492913|2021-08-19 00:00:00|50527| 0|19416|C3-5|QPV|21\u00b0 |None | 1.463459515469516| 17.4133659842749|\n|3 | 4|1413615572689|2021-11-05 00:00:00|50181| 1|18637|C4-2|GCL|6\u00b0 |ABC | 2.084002469706324| 0.489481411683505|\n|4 | 5| 245266998048|2021-09-25 00:00:00|50378| 0|29756|C5-4|LGY|6\u00b0 |XYZ | 0.5141579343276079| 8.550780816571438|\n|5 | 6| 947994853644|2021-10-14 00:00:00|50511| 0| 7890|C2-4|BET|0\u00b0 |XYZ | 1.1725893606177542| 7.447314130260951|\n|6 | 7|2230693047809|2021-10-07 00:00:00|50987| 1|26742|C1-3|CFP|0\u00b0 |XYZ | 1.0921267279498004|11.009210185311993|\n|... |... |... |... |... |...|... |... |...|... |... |... |... |\n|7,999,993|7999994|2047223556745|2021-09-03 00:00:00|50883| 1|15687|C3-1|RFR|None |XYZ | 1.3467185981566827|17.023443485654845|\n|7,999,994|7999995|1814140654790|2021-08-02 00:00:00|50152| 0|16556|C4-2|WTC|None |ABC | 1.1517593924478968| 8.201818634721487|\n|7,999,995|7999996| 155308171103|2021-10-14 00:00:00|50008| 1|14590|C1-3|WYM|0\u00b0 |None | 2.1273836233717978|23.295943554889195|\n|7,999,996|7999997|1620451532911|2021-12-12 00:00:00|50173| 1|20744|C2-1|ZYO|6\u00b0 |ABC | 2.482509134693724| 22.25375464857266|\n|7,999,997|7999998|1248987682094|2021-12-20 00:00:00|50052| 1|28298|C5-4|XAW|None |XYZ |0.17923757926558143|23.728160892974252|\n|7,999,998|7999999|1382206732187|2021-11-13 00:00:00|50993| 1|24832|C5-2|UDL|None |ABC |0.08425329763360942|12.707735293126758|\n|7,999,999|8000000| 600688069780|2021-09-28 00:00:00|50510| 0|15819|C3-4|IGY|None |ABC | 1.066241687256579|13.862069804070295|\n+=========+=======+=============+===================+=====+===+=====+====+===+=====+=====+===================+==================+\n PARAMETER DESCRIPTION rows number of rows wanted. Defaults to 100_000. TYPE: int DEFAULT: 100000 RETURNS DESCRIPTION Table Populated table. 
TYPE: Table Source code in tablite/datasets.py def synthetic_order_data(rows=100_000):\n \"\"\"Creates a synthetic dataset for testing that looks like this:\n (depending on number of rows)\n\n ```\n +=========+=======+=============+===================+=====+===+=====+====+===+=====+=====+===================+==================+\n | ~ | # | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 |\n | row | int | int | datetime | int |int| int |str |str|mixed|mixed| float | float |\n +---------+-------+-------------+-------------------+-----+---+-----+----+---+-----+-----+-------------------+------------------+\n |0 | 1|1478158906743|2021-10-27 00:00:00|50764| 1|29990|C4-5|APP|21\u00b0 |None | 2.0434376837650046|1.3371665497020444|\n |1 | 2|2271295805011|2021-09-13 00:00:00|50141| 0|10212|C4-5|TAE|None |None | 1.010318612835485| 20.94821610676901|\n |2 | 3|1598726492913|2021-08-19 00:00:00|50527| 0|19416|C3-5|QPV|21\u00b0 |None | 1.463459515469516| 17.4133659842749|\n |3 | 4|1413615572689|2021-11-05 00:00:00|50181| 1|18637|C4-2|GCL|6\u00b0 |ABC | 2.084002469706324| 0.489481411683505|\n |4 | 5| 245266998048|2021-09-25 00:00:00|50378| 0|29756|C5-4|LGY|6\u00b0 |XYZ | 0.5141579343276079| 8.550780816571438|\n |5 | 6| 947994853644|2021-10-14 00:00:00|50511| 0| 7890|C2-4|BET|0\u00b0 |XYZ | 1.1725893606177542| 7.447314130260951|\n |6 | 7|2230693047809|2021-10-07 00:00:00|50987| 1|26742|C1-3|CFP|0\u00b0 |XYZ | 1.0921267279498004|11.009210185311993|\n |... |... |... |... |... |...|... |... |...|... |... |... |... |\n |7,999,993|7999994|2047223556745|2021-09-03 00:00:00|50883| 1|15687|C3-1|RFR|None |XYZ | 1.3467185981566827|17.023443485654845|\n |7,999,994|7999995|1814140654790|2021-08-02 00:00:00|50152| 0|16556|C4-2|WTC|None |ABC | 1.1517593924478968| 8.201818634721487|\n |7,999,995|7999996| 155308171103|2021-10-14 00:00:00|50008| 1|14590|C1-3|WYM|0\u00b0 |None | 2.1273836233717978|23.295943554889195|\n |7,999,996|7999997|1620451532911|2021-12-12 00:00:00|50173| 1|20744|C2-1|ZYO|6\u00b0 |ABC | 2.482509134693724| 22.25375464857266|\n |7,999,997|7999998|1248987682094|2021-12-20 00:00:00|50052| 1|28298|C5-4|XAW|None |XYZ |0.17923757926558143|23.728160892974252|\n |7,999,998|7999999|1382206732187|2021-11-13 00:00:00|50993| 1|24832|C5-2|UDL|None |ABC |0.08425329763360942|12.707735293126758|\n |7,999,999|8000000| 600688069780|2021-09-28 00:00:00|50510| 0|15819|C3-4|IGY|None |ABC | 1.066241687256579|13.862069804070295|\n +=========+=======+=============+===================+=====+===+=====+====+===+=====+=====+===================+==================+\n ```\n\n Args:\n rows (int, optional): number of rows wanted. 
Defaults to 100_000.\n\n Returns:\n Table (Table): Populated table.\n \"\"\" # noqa\n rows = int(rows)\n\n L1 = [\"None\", \"0\u00b0\", \"6\u00b0\", \"21\u00b0\"]\n L2 = [\"ABC\", \"XYZ\", \"\"]\n\n t = Table()\n assert isinstance(t, Table)\n for page_n in range(math.ceil(rows / Config.PAGE_SIZE)): # n pages\n start = (page_n * Config.PAGE_SIZE)\n end = min(start + Config.PAGE_SIZE, rows)\n ro = range(start, end)\n\n t2 = Table()\n t2[\"#\"] = [v+1 for v in ro]\n # 1 - mock orderid\n t2[\"1\"] = [random.randint(18_778_628_504, 2277_772_117_504) for i in ro]\n # 2 - mock delivery date.\n t2[\"2\"] = [datetime.fromordinal(random.randint(738000, 738150)).isoformat() for i in ro]\n # 3 - mock store id.\n t2[\"3\"] = [random.randint(50000, 51000) for _ in ro]\n # 4 - random bit.\n t2[\"4\"] = [random.randint(0, 1) for _ in ro]\n # 5 - mock product id\n t2[\"5\"] = [random.randint(3000, 30000) for _ in ro]\n # 6 - random weird string\n t2[\"6\"] = [f\"C{random.randint(1, 5)}-{random.randint(1, 5)}\" for _ in ro]\n # 7 - # random category\n t2[\"7\"] = [\"\".join(random.choice(ascii_uppercase) for _ in range(3)) for _ in ro]\n # 8 -random temperature group.\n t2[\"8\"] = [random.choice(L1) for _ in ro]\n # 9 - random choice of category\n t2[\"9\"] = [random.choice(L2) for _ in ro]\n # 10 - volume?\n t2[\"10\"] = [random.uniform(0.01, 2.5) for _ in ro]\n # 11 - units?\n t2[\"11\"] = [f\"{random.uniform(0.1, 25)}\" for _ in ro]\n\n if len(t) == 0:\n t = t2\n else:\n t += t2\n\n return t\n "},{"location":"reference/datatypes/","title":"Datatypes","text":""},{"location":"reference/datatypes/#tablite.datatypes","title":"tablite.datatypes ","text":""},{"location":"reference/datatypes/#tablite.datatypes-attributes","title":"Attributes","text":""},{"location":"reference/datatypes/#tablite.datatypes.matched_types","title":"tablite.datatypes.matched_types = {int: DataTypes._infer_int, str: DataTypes._infer_str, float: DataTypes._infer_float, bool: DataTypes._infer_bool, date: DataTypes._infer_date, datetime: DataTypes._infer_datetime, time: DataTypes._infer_time} module-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes-classes","title":"Classes","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes","title":"tablite.datatypes.DataTypes ","text":" Bases: object DataTypes is the conversion library for all datatypes. It supports any / all python datatypes. 
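A minimal sketch of the byte/type-code round trip exposed by the functions listed further down this page (`to_bytes`, `type_code`, `from_type_code`); that `from_type_code` accepts the ISO string produced by `to_bytes` is an assumption based on the quoted source:

```python
from datetime import datetime
from tablite.datatypes import DataTypes

v = datetime(2023, 1, 1, 12, 30)
b = DataTypes.to_bytes(v)      # b'2023-01-01T12:30:00' via b_datetime
code = DataTypes.type_code(v)  # 7 for datetime, per pytype_from_type_code below
v2 = DataTypes.from_type_code(b.decode("utf-8"), code)  # assumption: parses ISO text
```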
"},{"location":"reference/datatypes/#tablite.datatypes.DataTypes-attributes","title":"Attributes","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.int","title":"tablite.datatypes.DataTypes.int = int class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.str","title":"tablite.datatypes.DataTypes.str = str class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.float","title":"tablite.datatypes.DataTypes.float = float class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.bool","title":"tablite.datatypes.DataTypes.bool = bool class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.date","title":"tablite.datatypes.DataTypes.date = date class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.datetime","title":"tablite.datatypes.DataTypes.datetime = datetime class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.time","title":"tablite.datatypes.DataTypes.time = time class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.timedelta","title":"tablite.datatypes.DataTypes.timedelta = timedelta class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.numeric_types","title":"tablite.datatypes.DataTypes.numeric_types = {int, float, date, time, datetime} class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.epoch","title":"tablite.datatypes.DataTypes.epoch = datetime(2000, 1, 1, 0, 0, 0, 0, timezone.utc) class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.epoch_no_tz","title":"tablite.datatypes.DataTypes.epoch_no_tz = datetime(2000, 1, 1, 0, 0, 0, 0) class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.digits","title":"tablite.datatypes.DataTypes.digits = '1234567890' class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.decimals","title":"tablite.datatypes.DataTypes.decimals = set('1234567890-+eE.') class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.integers","title":"tablite.datatypes.DataTypes.integers = set('1234567890-+') class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.nones","title":"tablite.datatypes.DataTypes.nones = {'null', 'Null', 'NULL', '#N/A', '#n/a', '', 'None', None, np.nan} class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.none_type","title":"tablite.datatypes.DataTypes.none_type = type(None) class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.bytes_functions","title":"tablite.datatypes.DataTypes.bytes_functions = {type(None): b_none, bool: b_bool, int: b_int, float: b_float, str: b_str, bytes: b_bytes, datetime: b_datetime, date: b_date, time: b_time, timedelta: b_timedelta} class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.type_code_functions","title":"tablite.datatypes.DataTypes.type_code_functions = {1: _none, 2: 
_bool, 3: _int, 4: _float, 5: _str, 6: _bytes, 7: _datetime, 8: _date, 9: _time, 10: _timedelta, 11: _unpickle} class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.pytype_from_type_code","title":"tablite.datatypes.DataTypes.pytype_from_type_code = {1: type(None), 2: bool, 3: int, 4: float, 5: str, 6: bytes, 7: datetime, 8: date, 9: time, 10: timedelta, 11: 'pickled object'} class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.date_formats","title":"tablite.datatypes.DataTypes.date_formats = {'NNNN-NN-NN': lambda x: date(*int(i) for i in x.split('-')), 'NNNN-N-NN': lambda x: date(*int(i) for i in x.split('-')), 'NNNN-NN-N': lambda x: date(*int(i) for i in x.split('-')), 'NNNN-N-N': lambda x: date(*int(i) for i in x.split('-')), 'NN-NN-NNNN': lambda x: date(*[int(i) for i in x.split('-')][::-1]), 'N-NN-NNNN': lambda x: date(*[int(i) for i in x.split('-')][::-1]), 'NN-N-NNNN': lambda x: date(*[int(i) for i in x.split('-')][::-1]), 'N-N-NNNN': lambda x: date(*[int(i) for i in x.split('-')][::-1]), 'NNNN.NN.NN': lambda x: date(*int(i) for i in x.split('.')), 'NNNN.N.NN': lambda x: date(*int(i) for i in x.split('.')), 'NNNN.NN.N': lambda x: date(*int(i) for i in x.split('.')), 'NNNN.N.N': lambda x: date(*int(i) for i in x.split('.')), 'NN.NN.NNNN': lambda x: date(*[int(i) for i in x.split('.')][::-1]), 'N.NN.NNNN': lambda x: date(*[int(i) for i in x.split('.')][::-1]), 'NN.N.NNNN': lambda x: date(*[int(i) for i in x.split('.')][::-1]), 'N.N.NNNN': lambda x: date(*[int(i) for i in x.split('.')][::-1]), 'NNNN/NN/NN': lambda x: date(*int(i) for i in x.split('/')), 'NNNN/N/NN': lambda x: date(*int(i) for i in x.split('/')), 'NNNN/NN/N': lambda x: date(*int(i) for i in x.split('/')), 'NNNN/N/N': lambda x: date(*int(i) for i in x.split('/')), 'NN/NN/NNNN': lambda x: date(*[int(i) for i in x.split('/')][::-1]), 'N/NN/NNNN': lambda x: date(*[int(i) for i in x.split('/')][::-1]), 'NN/N/NNNN': lambda x: date(*[int(i) for i in x.split('/')][::-1]), 'N/N/NNNN': lambda x: date(*[int(i) for i in x.split('/')][::-1]), 'NNNN NN NN': lambda x: date(*int(i) for i in x.split(' ')), 'NNNN N NN': lambda x: date(*int(i) for i in x.split(' ')), 'NNNN NN N': lambda x: date(*int(i) for i in x.split(' ')), 'NNNN N N': lambda x: date(*int(i) for i in x.split(' ')), 'NN NN NNNN': lambda x: date(*[int(i) for i in x.split(' ')][::-1]), 'N N NNNN': lambda x: date(*[int(i) for i in x.split(' ')][::-1]), 'NN N NNNN': lambda x: date(*[int(i) for i in x.split(' ')][::-1]), 'N NN NNNN': lambda x: date(*[int(i) for i in x.split(' ')][::-1]), 'NNNNNNNN': lambda x: date(*(int(x[:4]), int(x[4:6]), int(x[6:])))} class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.datetime_formats","title":"tablite.datatypes.DataTypes.datetime_formats = {'NNNN-NN-NNTNN:NN:NN': lambda x: DataTypes.pattern_to_datetime(x), 'NNNN-NN-NNTNN:NN': lambda x: DataTypes.pattern_to_datetime(x), 'NNNN-NN-NN NN:NN:NN': lambda x: DataTypes.pattern_to_datetime(x, T=' '), 'NNNN-NN-NN NN:NN': lambda x: DataTypes.pattern_to_datetime(x, T=' '), 'NNNN/NN/NNTNN:NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='/'), 'NNNN/NN/NNTNN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='/'), 'NNNN/NN/NN NN:NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='/', T=' '), 'NNNN/NN/NN NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='/', T=' '), 'NNNN NN NNTNN:NN:NN': lambda x: 
DataTypes.pattern_to_datetime(x, ymd=' '), 'NNNN NN NNTNN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd=' '), 'NNNN NN NN NN:NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd=' ', T=' '), 'NNNN NN NN NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd=' ', T=' '), 'NNNN.NN.NNTNN:NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='.'), 'NNNN.NN.NNTNN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='.'), 'NNNN.NN.NN NN:NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='.', T=' '), 'NNNN.NN.NN NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='.', T=' '), 'NN-NN-NNNNTNN:NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='-', T=' ', day_first=True), 'NN-NN-NNNNTNN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='-', T=' ', day_first=True), 'NN-NN-NNNN NN:NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='-', T=' ', day_first=True), 'NN-NN-NNNN NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='-', T=' ', day_first=True), 'NN/NN/NNNNTNN:NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='/', day_first=True), 'NN/NN/NNNNTNN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='/', day_first=True), 'NN/NN/NNNN NN:NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='/', T=' ', day_first=True), 'NN/NN/NNNN NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='/', T=' ', day_first=True), 'NN NN NNNNTNN:NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='/', day_first=True), 'NN NN NNNNTNN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='/', day_first=True), 'NN NN NNNN NN:NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='/', day_first=True), 'NN NN NNNN NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='/', day_first=True), 'NN.NN.NNNNTNN:NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='.', day_first=True), 'NN.NN.NNNNTNN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='.', day_first=True), 'NN.NN.NNNN NN:NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='.', day_first=True), 'NN.NN.NNNN NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='.', day_first=True), 'NNNNNNNNTNNNNNN': lambda x: DataTypes.pattern_to_datetime(x, compact=1), 'NNNNNNNNTNNNN': lambda x: DataTypes.pattern_to_datetime(x, compact=1), 'NNNNNNNNTNN': lambda x: DataTypes.pattern_to_datetime(x, compact=1), 'NNNNNNNNNN': lambda x: DataTypes.pattern_to_datetime(x, compact=2), 'NNNNNNNNNNNN': lambda x: DataTypes.pattern_to_datetime(x, compact=2), 'NNNNNNNNNNNNNN': lambda x: DataTypes.pattern_to_datetime(x, compact=2), 'NNNNNNNNTNN:NN:NN': lambda x: DataTypes.pattern_to_datetime(x, compact=3)} class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.types","title":"tablite.datatypes.DataTypes.types = [datetime, date, time, int, bool, float, str] class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes-functions","title":"Functions","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.type_code","title":"tablite.datatypes.DataTypes.type_code(value) classmethod ","text":"Source code in tablite/datatypes.py @classmethod\ndef type_code(cls, value):\n if type(value) in cls._type_codes:\n return cls._type_codes[type(value)]\n elif hasattr(value, \"dtype\"):\n dtype = pytype(value)\n return cls._type_codes[dtype]\n else:\n return cls._type_codes[\"pickle\"]\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.b_none","title":"tablite.datatypes.DataTypes.b_none(v) ","text":"Source code in tablite/datatypes.py def 
b_none(v):\n return b\"None\"\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.b_bool","title":"tablite.datatypes.DataTypes.b_bool(v) ","text":"Source code in tablite/datatypes.py def b_bool(v):\n return bytes(str(v), encoding=\"utf-8\")\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.b_int","title":"tablite.datatypes.DataTypes.b_int(v) ","text":"Source code in tablite/datatypes.py def b_int(v):\n return bytes(str(v), encoding=\"utf-8\")\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.b_float","title":"tablite.datatypes.DataTypes.b_float(v) ","text":"Source code in tablite/datatypes.py def b_float(v):\n return bytes(str(v), encoding=\"utf-8\")\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.b_str","title":"tablite.datatypes.DataTypes.b_str(v) ","text":"Source code in tablite/datatypes.py def b_str(v):\n return v.encode(\"utf-8\")\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.b_bytes","title":"tablite.datatypes.DataTypes.b_bytes(v) ","text":"Source code in tablite/datatypes.py def b_bytes(v):\n return v\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.b_datetime","title":"tablite.datatypes.DataTypes.b_datetime(v) ","text":"Source code in tablite/datatypes.py def b_datetime(v):\n return bytes(v.isoformat(), encoding=\"utf-8\")\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.b_date","title":"tablite.datatypes.DataTypes.b_date(v) ","text":"Source code in tablite/datatypes.py def b_date(v):\n return bytes(v.isoformat(), encoding=\"utf-8\")\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.b_time","title":"tablite.datatypes.DataTypes.b_time(v) ","text":"Source code in tablite/datatypes.py def b_time(v):\n return bytes(v.isoformat(), encoding=\"utf-8\")\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.b_timedelta","title":"tablite.datatypes.DataTypes.b_timedelta(v) ","text":"Source code in tablite/datatypes.py def b_timedelta(v):\n return bytes(str(float(v.days + (v.seconds / (24 * 60 * 60)))), \"utf-8\")\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.b_pickle","title":"tablite.datatypes.DataTypes.b_pickle(v) ","text":"Source code in tablite/datatypes.py def b_pickle(v):\n return pickle.dumps(v, protocol=0)\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.to_bytes","title":"tablite.datatypes.DataTypes.to_bytes(v) classmethod ","text":"Source code in tablite/datatypes.py @classmethod\ndef to_bytes(cls, v):\n if type(v) in cls.bytes_functions: # it's a python native type\n f = cls.bytes_functions[type(v)]\n elif hasattr(v, \"dtype\"): # it's a numpy/c type.\n dtype = pytype(v)\n f = cls.bytes_functions[dtype]\n else:\n f = cls.b_pickle\n return f(v)\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.from_type_code","title":"tablite.datatypes.DataTypes.from_type_code(value, code) classmethod ","text":"Source code in tablite/datatypes.py @classmethod\ndef from_type_code(cls, value, code):\n f = cls.type_code_functions[code]\n return f(value)\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.pattern_to_datetime","title":"tablite.datatypes.DataTypes.pattern_to_datetime(iso_string, ymd=None, T=None, compact=0, day_first=False) staticmethod ","text":"Source code in tablite/datatypes.py @staticmethod\ndef pattern_to_datetime(iso_string, ymd=None, T=None, compact=0, day_first=False):\n assert isinstance(iso_string, str)\n if compact:\n s = iso_string\n if compact == 1: # 
has T\n            slices = [\n                (0, 4, \"-\"),\n                (4, 6, \"-\"),\n                (6, 8, \"T\"),\n                (9, 11, \":\"),\n                (11, 13, \":\"),\n                (13, len(s), \"\"),\n            ]\n        elif compact == 2:  # has no T.\n            slices = [\n                (0, 4, \"-\"),\n                (4, 6, \"-\"),\n                (6, 8, \"T\"),\n                (8, 10, \":\"),\n                (10, 12, \":\"),\n                (12, len(s), \"\"),\n            ]\n        elif compact == 3:  # has T and :\n            slices = [\n                (0, 4, \"-\"),\n                (4, 6, \"-\"),\n                (6, 8, \"T\"),\n                (9, 11, \":\"),\n                (12, 14, \":\"),\n                (15, len(s), \"\"),\n            ]\n        else:\n            raise TypeError\n        iso_string = \"\".join([s[a:b] + c for a, b, c in slices if b <= len(s)])\n        iso_string = iso_string.rstrip(\":\")\n\n    if day_first:\n        s = iso_string\n        iso_string = \"\".join((s[6:10], \"-\", s[3:5], \"-\", s[0:2], s[10:]))\n\n    if \",\" in iso_string:\n        iso_string = iso_string.replace(\",\", \".\")\n\n    dot = iso_string[::-1].find(\".\")\n    if 0 < dot < 10:\n        ix = len(iso_string) - dot\n        microsecond = int(float(f\"0{iso_string[ix - 1:]}\") * 10**6)\n        # fmt:off\n        iso_string = iso_string[: len(iso_string) - dot] + str(microsecond).rjust(6, \"0\")\n        # fmt:on\n    if ymd:\n        iso_string = iso_string.replace(ymd, \"-\", 2)\n    if T:\n        iso_string = iso_string.replace(T, \"T\")\n    return datetime.fromisoformat(iso_string)\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.round","title":"tablite.datatypes.DataTypes.round(value, multiple, up=None) classmethod ","text":"a nicer way to round numbers. PARAMETER DESCRIPTION value value to be rounded TYPE: (float, integer, datetime) multiple value to be used as the base of rounding. 1) multiple = 1 is the same as rounding to whole integers. 2) multiple = 0.001 is the same as rounding to 3 digits precision. 3) multiple = 3.1415 is rounding to the nearest multiple of 3.1415 4) value = datetime(2022,8,18,11,14,53,440) 5) multiple = timedelta(hours=0.5) 6) round(value,multiple) is datetime(2022,8,18,11,0) TYPE: (float, integer, timedelta) up None (default) or boolean rounds half, up or down. round(1.6, 1) rounds to 2. round(1.4, 1) rounds to 1. round(1.5, 1, up=True) rounds to 2. round(1.5, 1, up=False) rounds to 1. TYPE: (None, bool) DEFAULT: None RETURNS DESCRIPTION float,integer,datetime: rounded value in same type as input. 
Source code in tablite/datatypes.py @classmethod\ndef round(cls, value, multiple, up=None):\n    \"\"\"a nicer way to round numbers.\n\n    Args:\n        value (float,integer,datetime): value to be rounded\n\n        multiple (float,integer,timedelta): value to be used as the base of rounding.\n        1) multiple = 1 is the same as rounding to whole integers.\n        2) multiple = 0.001 is the same as rounding to 3 digits precision.\n        3) multiple = 3.1415 is rounding to the nearest multiple of 3.1415\n        4) value = datetime(2022,8,18,11,14,53,440)\n        5) multiple = timedelta(hours=0.5)\n        6) round(value,multiple) is datetime(2022,8,18,11,0)\n\n        up (None, bool, optional):\n        None (default) or boolean rounds half, up or down.\n        round(1.6, 1) rounds to 2.\n        round(1.4, 1) rounds to 1.\n        round(1.5, 1, up=True) rounds to 2.\n        round(1.5, 1, up=False) rounds to 1.\n\n    Returns:\n        float,integer,datetime: rounded value in same type as input.\n    \"\"\"\n    epoch = 0\n    if isinstance(value, (datetime)) and isinstance(multiple, timedelta):\n        if value.tzinfo is None:\n            epoch = cls.epoch_no_tz\n        else:\n            epoch = cls.epoch\n\n    value2 = value - epoch\n    if value2 == 0:\n        return value2\n\n    low = (value2 // multiple) * multiple\n    high = low + multiple\n    if up is True:\n        return high + epoch\n    elif up is False:\n        return low + epoch\n    else:\n        if abs((high + epoch) - value) < abs(value - (low + epoch)):\n            return high + epoch\n        else:\n            return low + epoch\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.to_json","title":"tablite.datatypes.DataTypes.to_json(v) staticmethod ","text":"converts any python type to json. PARAMETER DESCRIPTION v value to convert to json TYPE: any RETURNS DESCRIPTION json compatible value from v Source code in tablite/datatypes.py @staticmethod\ndef to_json(v):\n    \"\"\"converts any python type to json.\n\n    Args:\n        v (any): value to convert to json\n\n    Returns:\n        json compatible value from v\n    \"\"\"\n    if hasattr(v, \"dtype\"):\n        v = numpy_to_python(v)\n    if v is None:\n        return v\n    elif v is False:\n        # using isinstance(v, bool): won't work as False also is int of zero.\n        return str(v)\n    elif v is True:\n        return str(v)\n    elif isinstance(v, int):\n        return v\n    elif isinstance(v, str):\n        return v\n    elif isinstance(v, float):\n        return v\n    elif isinstance(v, datetime):\n        return v.isoformat()\n    elif isinstance(v, time):\n        return v.isoformat()\n    elif isinstance(v, date):\n        return v.isoformat()\n    elif isinstance(v, timedelta):\n        return f\"P{v.days}DT{v.seconds + (v.microseconds / 1e6)}S\"\n    else:\n        raise TypeError(f\"The datatype {type(v)} is not supported.\")\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.from_json","title":"tablite.datatypes.DataTypes.from_json(v, dtype) staticmethod ","text":"converts json to python datatype PARAMETER DESCRIPTION v value TYPE: any dtype any python type TYPE: python type RETURNS DESCRIPTION python type of value v Source code in tablite/datatypes.py @staticmethod\ndef from_json(v, dtype):\n    \"\"\"converts json to python datatype\n\n    Args:\n        v (any): value\n        dtype (python type): any python type\n\n    Returns:\n        python type of value v\n    \"\"\"\n    if v in DataTypes.nones:\n        if dtype is str and v == \"\":\n            return \"\"\n        else:\n            return None\n    if dtype is int:\n        return int(v)\n    elif dtype is str:\n        return str(v)\n    elif dtype is float:\n        return float(v)\n    elif dtype is bool:\n        if v == \"False\":\n            return False\n        elif v == \"True\":\n            return True\n        else:\n            raise ValueError(v)\n    elif dtype is date:\n        return date.fromisoformat(v)\n    elif dtype is datetime:\n        return 
datetime.fromisoformat(v)\n    elif dtype is time:\n        return time.fromisoformat(v)\n    elif dtype is timedelta:\n        L = v.split(\"DT\")\n        days = int(L[0].lstrip(\"P\"))\n        seconds = float(L[1].rstrip(\"S\"))\n        return timedelta(days, seconds)\n    else:\n        raise TypeError(f\"The datatype {str(dtype)} is not supported.\")\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.guess_types","title":"tablite.datatypes.DataTypes.guess_types(*values) staticmethod ","text":"Attempts to guess the datatype for *values returns dict with matching datatypes and probabilities RETURNS DESCRIPTION dict {key: type, value: probability} Source code in tablite/datatypes.py @staticmethod\ndef guess_types(*values):\n    \"\"\"Attempts to guess the datatype for *values\n    returns dict with matching datatypes and probabilities\n\n    Returns:\n        dict: {key: type, value: probability}\n    \"\"\"\n    d = defaultdict(int)\n    probability = Rank(*DataTypes.types[:])\n\n    for value in values:\n        if hasattr(value, \"dtype\"):\n            value = numpy_to_python(value)\n\n        for dtype in probability:\n            try:\n                _ = DataTypes.infer(value, dtype)\n                d[dtype] += 1\n                probability.match(dtype)\n                break\n            except (ValueError, TypeError):\n                pass\n    if not d:\n        d[str] = len(values)\n    return {k: round(v / len(values), 3) for k, v in d.items()}\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.guess","title":"tablite.datatypes.DataTypes.guess(*values) staticmethod ","text":"Makes a best guess of the datatype for *values returns list of native python values RETURNS DESCRIPTION list list of native python values Source code in tablite/datatypes.py @staticmethod\ndef guess(*values):\n    \"\"\"Makes a best guess of the datatype for *values\n    returns list of native python values\n\n    Returns:\n        list: list of native python values\n    \"\"\"\n    probability = Rank(*DataTypes.types[:])\n    matches = [None for _ in values[0]]\n\n    for ix, value in enumerate(values[0]):\n        if hasattr(value, \"dtype\"):\n            value = numpy_to_python(value)\n        for dtype in probability:\n            try:\n                matches[ix] = DataTypes.infer(value, dtype)\n                probability.match(dtype)\n                break\n            except (ValueError, TypeError):\n                pass\n    return matches\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.infer","title":"tablite.datatypes.DataTypes.infer(v, dtype) classmethod ","text":"Source code in tablite/datatypes.py @classmethod\ndef infer(cls, v, dtype):\n    if isinstance(v, str) and dtype == str:\n        # we got a string, we're trying to infer it to string, we shouldn't check for None-ness\n        return v\n\n    if v in DataTypes.nones:\n        return None\n\n    if dtype not in matched_types:\n        raise TypeError(f\"The datatype {str(dtype)} is not supported.\")\n\n    return matched_types[dtype](v)\n "},{"location":"reference/datatypes/#tablite.datatypes.Rank","title":"tablite.datatypes.Rank(*items) ","text":" Bases: object Source code in tablite/datatypes.py def __init__(self, *items):\n    self.items = {i: ix for i, ix in zip(items, range(len(items)))}\n    self.ranks = [0 for _ in items]\n    self.items_list = [i for i in items]\n "},{"location":"reference/datatypes/#tablite.datatypes.Rank-attributes","title":"Attributes","text":""},{"location":"reference/datatypes/#tablite.datatypes.Rank.items","title":"tablite.datatypes.Rank.items = {i: ix for (i, ix) in zip(items, range(len(items)))} instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.Rank.ranks","title":"tablite.datatypes.Rank.ranks = [0 for _ in items] instance-attribute 
","text":""},{"location":"reference/datatypes/#tablite.datatypes.Rank.items_list","title":"tablite.datatypes.Rank.items_list = [i for i in items] instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.Rank-functions","title":"Functions","text":""},{"location":"reference/datatypes/#tablite.datatypes.Rank.match","title":"tablite.datatypes.Rank.match(k) ","text":"Source code in tablite/datatypes.py def match(self, k): # k+=1\n ix = self.items[k]\n r = self.ranks\n r[ix] += 1\n\n if ix > 0:\n p = self.items_list\n while (\n r[ix] > r[ix - 1] and ix > 0\n ): # use a simple bubble sort to maintain rank\n r[ix], r[ix - 1] = r[ix - 1], r[ix]\n p[ix], p[ix - 1] = p[ix - 1], p[ix]\n old = p[ix]\n self.items[old] = ix\n self.items[k] = ix - 1\n ix -= 1\n "},{"location":"reference/datatypes/#tablite.datatypes.Rank.__iter__","title":"tablite.datatypes.Rank.__iter__() ","text":"Source code in tablite/datatypes.py def __iter__(self):\n return iter(self.items_list)\n "},{"location":"reference/datatypes/#tablite.datatypes.MetaArray","title":"tablite.datatypes.MetaArray ","text":" Bases: ndarray Array with metadata. "},{"location":"reference/datatypes/#tablite.datatypes.MetaArray-functions","title":"Functions","text":""},{"location":"reference/datatypes/#tablite.datatypes.MetaArray.__new__","title":"tablite.datatypes.MetaArray.__new__(array, dtype=None, order=None, **kwargs) ","text":"Source code in tablite/datatypes.py def __new__(cls, array, dtype=None, order=None, **kwargs):\n obj = np.asarray(array, dtype=dtype, order=order).view(cls)\n obj.metadata = kwargs\n return obj\n "},{"location":"reference/datatypes/#tablite.datatypes.MetaArray.__array_finalize__","title":"tablite.datatypes.MetaArray.__array_finalize__(obj) ","text":"Source code in tablite/datatypes.py def __array_finalize__(self, obj):\n if obj is None:\n return\n self.metadata = getattr(obj, \"metadata\", None)\n "},{"location":"reference/datatypes/#tablite.datatypes-functions","title":"Functions","text":""},{"location":"reference/datatypes/#tablite.datatypes.numpy_to_python","title":"tablite.datatypes.numpy_to_python(obj: Any) -> Any ","text":"Converts numpy types to python types. See https://numpy.org/doc/stable/reference/arrays.scalars.html PARAMETER DESCRIPTION obj A numpy object TYPE: Any RETURNS DESCRIPTION Any python object: A python object Source code in tablite/datatypes.py def numpy_to_python(obj: Any) -> Any:\n \"\"\"Converts numpy types to python types.\n\n See https://numpy.org/doc/stable/reference/arrays.scalars.html\n\n Args:\n obj (Any): A numpy object\n\n Returns:\n python object: A python object\n \"\"\"\n if isinstance(obj, np.generic):\n return obj.item()\n return obj\n "},{"location":"reference/datatypes/#tablite.datatypes.pytype","title":"tablite.datatypes.pytype(obj) ","text":"Returns the python type of any object PARAMETER DESCRIPTION obj any numpy or python object TYPE: Any RETURNS DESCRIPTION type type of obj Source code in tablite/datatypes.py def pytype(obj):\n \"\"\"Returns the python type of any object\n\n Args:\n obj (Any): any numpy or python object\n\n Returns:\n type: type of obj\n \"\"\"\n if isinstance(obj, np.generic):\n return type(obj.item())\n return type(obj)\n "},{"location":"reference/datatypes/#tablite.datatypes.pytype_from_iterable","title":"tablite.datatypes.pytype_from_iterable(iterable: {tuple, list}) -> {np.dtype, dict} ","text":"helper to make correct np array from python types. PARAMETER DESCRIPTION iterable values to be converted to numpy array. 
TYPE: (tuple, list) RAISES DESCRIPTION NotImplementedError if datatype is not supported. RETURNS DESCRIPTION {dtype, dict} np.dtype: python type of the iterable. Source code in tablite/datatypes.py def pytype_from_iterable(iterable: {tuple, list}) -> {np.dtype, dict}:\n    \"\"\"helper to make correct np array from python types.\n\n    Args:\n        iterable (tuple,list): values to be converted to numpy array.\n\n    Raises:\n        NotImplementedError: if datatype is not supported.\n\n    Returns:\n        np.dtype: python type of the iterable.\n    \"\"\"\n    py_types = {}\n    if isinstance(iterable, (tuple, list)):\n        type_counter = Counter((pytype(v) for v in iterable))\n\n        for k, v in type_counter.items():\n            py_types[k] = v\n\n        if len(py_types) == 0:\n            np_dtype, py_dtype = object, bool\n        elif len(py_types) == 1:\n            py_dtype = list(py_types.keys())[0]\n            if py_dtype == datetime:\n                np_dtype = np.datetime64\n            elif py_dtype == date:\n                np_dtype = np.datetime64\n            elif py_dtype == timedelta:\n                np_dtype = np.timedelta64\n            else:\n                np_dtype = None\n        else:\n            np_dtype = object\n    elif isinstance(iterable, np.ndarray):\n        if iterable.dtype == object:\n            np_dtype = object\n            py_types = dict(Counter((pytype(v) for v in iterable)))\n        else:\n            np_dtype = iterable.dtype\n            if len(iterable) > 0:\n                py_types = {pytype(iterable[0]): len(iterable)}\n            else:\n                py_types = {pytype(np_dtype.type()): len(iterable)}\n    else:\n        raise NotImplementedError(f\"No handler for {type(iterable)}\")\n\n    return np_dtype, py_types\n "},{"location":"reference/datatypes/#tablite.datatypes.list_to_np_array","title":"tablite.datatypes.list_to_np_array(iterable) ","text":"helper to make correct np array from python types. Example of problem where numpy turns mixed types into strings. np.array([4, '5']) np.ndarray(['4', '5']) RETURNS DESCRIPTION np.array datatypes Source code in tablite/datatypes.py def list_to_np_array(iterable):\n    \"\"\"helper to make correct np array from python types.\n    Example of problem where numpy turns mixed types into strings.\n    >>> np.array([4, '5'])\n    np.ndarray(['4', '5'])\n\n    returns:\n        np.array\n        datatypes\n    \"\"\"\n    np_dtype, py_dtype = pytype_from_iterable(iterable)\n\n    value = MetaArray(iterable, dtype=np_dtype, py_dtype=py_dtype)\n    return value\n "},{"location":"reference/datatypes/#tablite.datatypes.np_type_unify","title":"tablite.datatypes.np_type_unify(arrays) ","text":"unifies numpy types. PARAMETER DESCRIPTION arrays List of numpy arrays TYPE: list RETURNS DESCRIPTION np.ndarray: numpy array of a single type. Source code in tablite/datatypes.py def np_type_unify(arrays):\n    \"\"\"unifies numpy types.\n\n    Args:\n        arrays (list): List of numpy arrays\n\n    Returns:\n        np.ndarray: numpy array of a single type.\n    \"\"\"\n    dtypes = {arr.dtype: len(arr) for arr in arrays}\n    if len(dtypes) == 1:\n        dtype, _ = dtypes.popitem()\n    else:\n        for ix, arr in enumerate(arrays):\n            arrays[ix] = np.array(arr, dtype=object)\n        dtype = object\n    return np.concatenate(arrays, dtype=dtype)\n "},{"location":"reference/datatypes/#tablite.datatypes.multitype_set","title":"tablite.datatypes.multitype_set(arr) ","text":"prevents loss of True, False when calling sets. python loses values when the iterable is converted to a set. Example: {1, True, 0, False} PARAMETER DESCRIPTION arr iterable of mixed types. TYPE: Iterable RETURNS DESCRIPTION np.array: with unique values. Source code in tablite/datatypes.py def multitype_set(arr):\n    \"\"\"prevents loss of True, False when calling sets.\n\n    python loses values when the iterable is converted to a set. 
Example:\n    >>> {1, True, 0, False}\n    {0,1}\n\n    Args:\n        arr (Iterable): iterable of mixed types.\n\n    Returns:\n        np.array: with unique values.\n    \"\"\"\n    L = [(type(v), v) for v in arr]\n    L = list(set(L))\n    L = [v for _, v in L]\n    return np.array(L, dtype=object)\n "},{"location":"reference/diff/","title":"Diff","text":""},{"location":"reference/diff/#tablite.diff","title":"tablite.diff ","text":""},{"location":"reference/diff/#tablite.diff-classes","title":"Classes","text":""},{"location":"reference/diff/#tablite.diff-functions","title":"Functions","text":""},{"location":"reference/diff/#tablite.diff.diff","title":"tablite.diff.diff(T, other, columns=None) ","text":"compares table self with table other PARAMETER DESCRIPTION self Table TYPE: Table other Table TYPE: Table columns list of column names to include in comparison. Defaults to None. TYPE: List DEFAULT: None RETURNS DESCRIPTION Table diff of self and other with diff in columns 1st and 2nd. Source code in tablite/diff.py def diff(T, other, columns=None):\n    \"\"\"compares table self with table other\n\n    Args:\n        self (Table): Table\n        other (Table): Table\n        columns (List, optional): list of column names to include in comparison. Defaults to None.\n\n    Returns:\n        Table: diff of self and other with diff in columns 1st and 2nd.\n    \"\"\"\n    sub_cls_check(T, BaseTable)\n    sub_cls_check(other, BaseTable)\n    if columns is None:\n        columns = [name for name in T.columns if name in other.columns]\n    elif isinstance(columns, list) and all(isinstance(i, str) for i in columns):\n        for name in columns:\n            if name not in T.columns:\n                raise ValueError(f\"column '{name}' not found\")\n            if name not in other.columns:\n                raise ValueError(f\"column '{name}' not found\")\n    else:\n        raise TypeError(\"Expected list of column names\")\n\n    t1 = T[columns]\n    if issubclass(type(t1), BaseTable):\n        t1 = [tuple(r) for r in T.rows]\n    else:\n        t1 = list(T)\n    t2 = other[columns]\n    if issubclass(type(t2), BaseTable):\n        t2 = [tuple(r) for r in other.rows]\n    else:\n        t2 = list(other)\n\n    sm = difflib.SequenceMatcher(None, t1, t2)\n    new = type(T)()\n    first = unique_name(\"1st\", columns)\n    second = unique_name(\"2nd\", columns)\n    new.add_columns(*columns + [first, second])\n\n    news = {n: [] for n in new.columns}  # Cache for Work in progress.\n\n    for opc, t1a, t1b, t2a, t2b in sm.get_opcodes():\n        if opc == \"insert\":\n            for name, col in zip(columns, zip(*t2[t2a:t2b])):\n                news[name].extend(col)\n            news[first] += [\"-\"] * (t2b - t2a)\n            news[second] += [\"+\"] * (t2b - t2a)\n\n        elif opc == \"delete\":\n            for name, col in zip(columns, zip(*t1[t1a:t1b])):\n                news[name].extend(col)\n            news[first] += [\"+\"] * (t1b - t1a)\n            news[second] += [\"-\"] * (t1b - t1a)\n\n        elif opc == \"equal\":\n            for name, col in zip(columns, zip(*t2[t2a:t2b])):\n                news[name].extend(col)\n            news[first] += [\"=\"] * (t2b - t2a)\n            news[second] += [\"=\"] * (t2b - t2a)\n\n        elif opc == \"replace\":\n            for name, col in zip(columns, zip(*t2[t2a:t2b])):\n                news[name].extend(col)\n            news[first] += [\"r\"] * (t2b - t2a)\n            news[second] += [\"r\"] * (t2b - t2a)\n\n        else:\n            pass\n\n        # Clear cache to free up memory once it exceeds a page.\n        if len(news[first]) > Config.PAGE_SIZE:\n            for name, L in news.items():\n                new[name].extend(np.array(L))\n                L.clear()\n\n    for name, L in news.items():\n        new[name].extend(np.array(L))\n        L.clear()\n    return new\n "},{"location":"reference/export_utils/","title":"Export utils","text":""},{"location":"reference/export_utils/#tablite.export_utils","title":"tablite.export_utils 
","text":""},{"location":"reference/export_utils/#tablite.export_utils-classes","title":"Classes","text":""},{"location":"reference/export_utils/#tablite.export_utils-functions","title":"Functions","text":""},{"location":"reference/export_utils/#tablite.export_utils.to_sql","title":"tablite.export_utils.to_sql(table, name) ","text":"generates ANSI-92 compliant SQL. PARAMETER DESCRIPTION name name of SQL table. TYPE: str Source code in tablite/export_utils.py def to_sql(table, name):\n \"\"\"\n generates ANSI-92 compliant SQL.\n\n args:\n name (str): name of SQL table.\n \"\"\"\n sub_cls_check(table, BaseTable)\n type_check(name, str)\n\n prefix = name\n name = \"T1\"\n create_table = \"\"\"CREATE TABLE {} ({})\"\"\"\n columns = []\n for name, col in table.columns.items():\n dtype = col.types()\n if len(dtype) == 1:\n dtype, _ = dtype.popitem()\n if dtype is int:\n dtype = \"INTEGER\"\n elif dtype is float:\n dtype = \"REAL\"\n else:\n dtype = \"TEXT\"\n else:\n dtype = \"TEXT\"\n definition = f\"{name} {dtype}\"\n columns.append(definition)\n\n create_table = create_table.format(prefix, \", \".join(columns))\n\n # return create_table\n row_inserts = []\n for row in table.rows:\n row_inserts.append(str(tuple([i if i is not None else \"NULL\" for i in row])))\n row_inserts = f\"INSERT INTO {prefix} VALUES \" + \",\".join(row_inserts)\n return \"begin; {}; {}; commit;\".format(create_table, row_inserts)\n "},{"location":"reference/export_utils/#tablite.export_utils.to_pandas","title":"tablite.export_utils.to_pandas(table) ","text":"returns pandas.DataFrame Source code in tablite/export_utils.py def to_pandas(table):\n \"\"\"\n returns pandas.DataFrame\n \"\"\"\n sub_cls_check(table, BaseTable)\n try:\n return pd.DataFrame(table.to_dict()) # noqa\n except ImportError:\n import pandas as pd # noqa\n return pd.DataFrame(table.to_dict()) # noqa\n "},{"location":"reference/export_utils/#tablite.export_utils.to_hdf5","title":"tablite.export_utils.to_hdf5(table, path) ","text":"creates a copy of the table as hdf5 Note that some loss of type information is to be expected in columns of mixed type: t.show(dtype=True) +===+===+=====+=====+====+=====+=====+===================+==========+========+===============+===+=========================+=====+===+ | # | A | B | C | D | E | F | G | H | I | J | K | L | M | O | |row|int|mixed|float|str |mixed| bool| datetime | date | time | timedelta |str| int |float|int| +---+---+-----+-----+----+-----+-----+-------------------+----------+--------+---------------+---+-------------------------+-----+---+ | 0 | -1|None | -1.1| |None |False|2023-06-09 09:12:06|2023-06-09|09:12:06| 1 day, 0:00:00|b |-100000000000000000000000| inf| 11| | 1 | 1| 1| 1.1|1000|1 | True|2023-06-09 09:12:06|2023-06-09|09:12:06|2 days, 0:06:40|\u55e8 | 100000000000000000000000| -inf|-11| +===+===+=====+=====+====+=====+=====+===================+==========+========+===============+===+=========================+=====+===+ t.to_hdf5(filename) t2 = Table.from_hdf5(filename) t2.show(dtype=True) +===+===+=====+=====+=====+=====+=====+===================+===================+========+===============+===+=========================+=====+===+ | # | A | B | C | D | E | F | G | H | I | J | K | L | M | O | |row|int|mixed|float|mixed|mixed| bool| datetime | datetime | time | str |str| int |float|int| +---+---+-----+-----+-----+-----+-----+-------------------+-------------------+--------+---------------+---+-------------------------+-----+---+ | 0 | -1|None | -1.1|None |None |False|2023-06-09 09:12:06|2023-06-09 
00:00:00|09:12:06|1 day, 0:00:00 |b |-100000000000000000000000| inf| 11| | 1 | 1| 1| 1.1| 1000| 1| True|2023-06-09 09:12:06|2023-06-09 00:00:00|09:12:06|2 days, 0:06:40|\u55e8 | 100000000000000000000000| -inf|-11| +===+===+=====+=====+=====+=====+=====+===================+===================+========+===============+===+=========================+=====+===+ Source code in tablite/export_utils.py def to_hdf5(table, path):\n # fmt: off\n \"\"\"\n creates a copy of the table as hdf5\n\n Note that some loss of type information is to be expected in columns of mixed type:\n >>> t.show(dtype=True)\n +===+===+=====+=====+====+=====+=====+===================+==========+========+===============+===+=========================+=====+===+\n | # | A | B | C | D | E | F | G | H | I | J | K | L | M | O |\n |row|int|mixed|float|str |mixed| bool| datetime | date | time | timedelta |str| int |float|int|\n +---+---+-----+-----+----+-----+-----+-------------------+----------+--------+---------------+---+-------------------------+-----+---+\n | 0 | -1|None | -1.1| |None |False|2023-06-09 09:12:06|2023-06-09|09:12:06| 1 day, 0:00:00|b |-100000000000000000000000| inf| 11|\n | 1 | 1| 1| 1.1|1000|1 | True|2023-06-09 09:12:06|2023-06-09|09:12:06|2 days, 0:06:40|\u55e8 | 100000000000000000000000| -inf|-11|\n +===+===+=====+=====+====+=====+=====+===================+==========+========+===============+===+=========================+=====+===+\n >>> t.to_hdf5(filename)\n >>> t2 = Table.from_hdf5(filename)\n >>> t2.show(dtype=True)\n +===+===+=====+=====+=====+=====+=====+===================+===================+========+===============+===+=========================+=====+===+\n | # | A | B | C | D | E | F | G | H | I | J | K | L | M | O |\n |row|int|mixed|float|mixed|mixed| bool| datetime | datetime | time | str |str| int |float|int|\n +---+---+-----+-----+-----+-----+-----+-------------------+-------------------+--------+---------------+---+-------------------------+-----+---+\n | 0 | -1|None | -1.1|None |None |False|2023-06-09 09:12:06|2023-06-09 00:00:00|09:12:06|1 day, 0:00:00 |b |-100000000000000000000000| inf| 11|\n | 1 | 1| 1| 1.1| 1000| 1| True|2023-06-09 09:12:06|2023-06-09 00:00:00|09:12:06|2 days, 0:06:40|\u55e8 | 100000000000000000000000| -inf|-11|\n +===+===+=====+=====+=====+=====+=====+===================+===================+========+===============+===+=========================+=====+===+\n \"\"\"\n # fmt: in\n import h5py\n\n sub_cls_check(table, BaseTable)\n type_check(path, Path)\n\n total = f\"{len(table.columns) * len(table):,}\" # noqa\n print(f\"writing {total} records to {path}\", end=\"\")\n\n with h5py.File(path, \"w\") as f:\n n = 0\n for name, col in table.items():\n try:\n f.create_dataset(name, data=col[:]) # stored in hdf5 as '/name'\n except TypeError:\n f.create_dataset(name, data=[str(i) for i in col[:]]) # stored in hdf5 as '/name'\n n += 1\n print(\"... done\")\n "},{"location":"reference/export_utils/#tablite.export_utils.excel_writer","title":"tablite.export_utils.excel_writer(table, path) ","text":"writer for excel files. This can create xlsx files beyond Excels. If you're using pyexcel to read the data, you'll see the data is there. If you're using Excel, Excel will stop loading after 1,048,576 rows. 
See pyexcel for more details: http://docs.pyexcel.org/ Source code in tablite/export_utils.py def excel_writer(table, path):\n    \"\"\"\n    writer for excel files.\n\n    This can create xlsx files beyond Excels.\n    If you're using pyexcel to read the data, you'll see the data is there.\n    If you're using Excel, Excel will stop loading after 1,048,576 rows.\n\n    See pyexcel for more details:\n    http://docs.pyexcel.org/\n    \"\"\"\n    import pyexcel\n\n    sub_cls_check(table, BaseTable)\n    type_check(path, Path)\n\n    def gen(table):  # local helper\n        yield table.columns\n        for row in table.rows:\n            yield row\n\n    data = list(gen(table))\n    if path.suffix in [\".xls\", \".ods\"]:\n        data = [\n            [str(v) if (isinstance(v, (int, float)) and abs(v) > 2**32 - 1) else DataTypes.to_json(v) for v in row]\n            for row in data\n        ]\n\n    pyexcel.save_as(array=data, dest_file_name=str(path))\n "},{"location":"reference/export_utils/#tablite.export_utils.to_json","title":"tablite.export_utils.to_json(table, *args, **kwargs) ","text":"Source code in tablite/export_utils.py def to_json(table, *args, **kwargs):\n    import json\n\n    sub_cls_check(table, BaseTable)\n    return json.dumps(table.as_json_serializable())\n "},{"location":"reference/export_utils/#tablite.export_utils.path_suffix_check","title":"tablite.export_utils.path_suffix_check(path, kind) ","text":"Source code in tablite/export_utils.py def path_suffix_check(path, kind):\n    if not path.suffix == kind:\n        raise ValueError(f\"Suffix mismatch: Expected {kind}, got {path.suffix} in {path.name}\")\n    if not path.parent.exists():\n        raise FileNotFoundError(f\"directory {path.parent} not found.\")\n "},{"location":"reference/export_utils/#tablite.export_utils.text_writer","title":"tablite.export_utils.text_writer(table, path, tqdm=_tqdm) ","text":"exports table to csv, tsv or txt depending on path suffix. follows the JSON norm. text escape is ON for all strings. "},{"location":"reference/export_utils/#tablite.export_utils.text_writer--note","title":"Note:","text":"If the delimiter is present in a string when the string is exported, text-escape is required, as the format otherwise is corrupted. When the file is being written, it is unknown whether any string in a column contains the delimiter. As text escaping the few strings that may contain the delimiter would lead to an asymmetric format, the safer guess is to text escape all strings. Source code in tablite/export_utils.py def text_writer(table, path, tqdm=_tqdm):\n    \"\"\"exports table to csv, tsv or txt depending on path suffix.\n    follows the JSON norm. text escape is ON for all strings.\n\n    Note:\n    ----------------------\n    If the delimiter is present in a string when the string is exported,\n    text-escape is required, as the format otherwise is corrupted.\n    When the file is being written, it is unknown whether any string in\n    a column contains the delimiter. 
As text escaping the few strings\n    that may contain the delimiter would lead to an asymmetric format,\n    the safer guess is to text escape all strings.\n    \"\"\"\n    sub_cls_check(table, BaseTable)\n    type_check(path, Path)\n\n    def txt(value):  # helper for text writer\n        if value is None:\n            return \"\"  # A column with 1,None,2 must be \"1,,2\".\n        elif isinstance(value, str):\n            # if not (value.startswith('\"') and value.endswith('\"')):\n            #     return f'\"{value}\"'  # this must be escape: \"the quick fox, jumped over the comma\"\n            # else:\n            return value  # this would for example be an empty string: \"\"\n        else:\n            return str(DataTypes.to_json(value))  # this handles datetimes, timedelta, etc.\n\n    delimiters = {\".csv\": \",\", \".tsv\": \"\\t\", \".txt\": \"|\"}\n    delimiter = delimiters.get(path.suffix)\n\n    with path.open(\"w\", encoding=\"utf-8\") as fo:\n        fo.write(delimiter.join(c for c in table.columns) + \"\\n\")\n        for row in tqdm(table.rows, total=len(table), disable=Config.TQDM_DISABLE):\n            fo.write(delimiter.join(txt(c) for c in row) + \"\\n\")\n "},{"location":"reference/export_utils/#tablite.export_utils.sql_writer","title":"tablite.export_utils.sql_writer(table, path) ","text":"Source code in tablite/export_utils.py def sql_writer(table, path):\n    type_check(table, BaseTable)\n    type_check(path, Path)\n    with path.open(\"w\", encoding=\"utf-8\") as fo:\n        fo.write(to_sql(table, name=path.stem))  # to_sql requires a table name; the file stem is a reasonable default.\n "},{"location":"reference/export_utils/#tablite.export_utils.json_writer","title":"tablite.export_utils.json_writer(table, path) ","text":"Source code in tablite/export_utils.py def json_writer(table, path):\n    type_check(table, BaseTable)\n    type_check(path, Path)\n    with path.open(\"w\") as fo:\n        fo.write(to_json(table))\n "},{"location":"reference/export_utils/#tablite.export_utils.to_html","title":"tablite.export_utils.to_html(table, path) ","text":"Source code in tablite/export_utils.py def to_html(table, path):\n    type_check(table, BaseTable)\n    type_check(path, Path)\n    with path.open(\"w\", encoding=\"utf-8\") as fo:\n        fo.write(table._repr_html_(slice(0, len(table))))\n "},{"location":"reference/file_reader_utils/","title":"File reader utils","text":""},{"location":"reference/file_reader_utils/#tablite.file_reader_utils","title":"tablite.file_reader_utils ","text":""},{"location":"reference/file_reader_utils/#tablite.file_reader_utils-attributes","title":"Attributes","text":""},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.ENCODING_GUESS_BYTES","title":"tablite.file_reader_utils.ENCODING_GUESS_BYTES = 10000 module-attribute ","text":""},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.header_readers","title":"tablite.file_reader_utils.header_readers = {'fods': excel_reader_headers, 'json': excel_reader_headers, 'simple': excel_reader_headers, 'rst': excel_reader_headers, 'mediawiki': excel_reader_headers, 'xlsx': excel_reader_headers, 'xlsm': excel_reader_headers, 'csv': text_reader_headers, 'tsv': text_reader_headers, 'txt': text_reader_headers, 'ods': ods_reader_headers} module-attribute ","text":""},{"location":"reference/file_reader_utils/#tablite.file_reader_utils-classes","title":"Classes","text":""},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.TextEscape","title":"tablite.file_reader_utils.TextEscape(openings='({[', closures=']})', text_qualifier='\"', delimiter=',', strip_leading_and_tailing_whitespace=False) ","text":" Bases: object enables parsing of CSV while respecting brackets and text marks. 
Example: text_escape = TextEscape() # set up the instance. for line in somefile.readlines(): list_of_words = text_escape(line) # use the instance. ... As an example, the Danes and Germans use \" for inches and ' for feet, so we will see data that contains nail (75 x 4 mm, 3\" x 3/12\"), so for this case ( and ) are valid escapes, but \" and ' aren't. Source code in tablite/file_reader_utils.py def __init__(\n    self,\n    openings=\"({[\",\n    closures=\"]})\",\n    text_qualifier='\"',\n    delimiter=\",\",\n    strip_leading_and_tailing_whitespace=False,\n):\n    \"\"\"\n    As an example, the Danes and Germans use \" for inches and ' for feet,\n    so we will see data that contains nail (75 x 4 mm, 3\" x 3/12\"), so\n    for this case ( and ) are valid escapes, but \" and ' aren't.\n\n    \"\"\"\n    if openings is None:\n        openings = [None]\n    elif isinstance(openings, str):\n        self.openings = {c for c in openings}\n    else:\n        raise TypeError(f\"expected str, got {type(openings)}\")\n\n    if closures is None:\n        closures = [None]\n    elif isinstance(closures, str):\n        self.closures = {c for c in closures}\n    else:\n        raise TypeError(f\"expected str, got {type(closures)}\")\n\n    if not isinstance(delimiter, str):\n        raise TypeError(f\"expected str, got {type(delimiter)}\")\n    self.delimiter = delimiter\n    self._delimiter_length = len(delimiter)\n    self.strip_leading_and_tailing_whitespace = strip_leading_and_tailing_whitespace\n\n    if text_qualifier is None:\n        pass\n    elif text_qualifier in openings + closures:\n        raise ValueError(\"It's a bad idea to have the quote character appear in openings or closures.\")\n    else:\n        self.qoute = text_qualifier\n\n    if not text_qualifier:\n        if not self.strip_leading_and_tailing_whitespace:\n            self.c = self._call_1\n        else:\n            self.c = self._call_2\n    else:\n        self.c = self._call_3\n "},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.TextEscape-attributes","title":"Attributes","text":""},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.TextEscape.openings","title":"tablite.file_reader_utils.TextEscape.openings = {c for c in openings} instance-attribute ","text":""},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.TextEscape.closures","title":"tablite.file_reader_utils.TextEscape.closures = {c for c in closures} instance-attribute ","text":""},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.TextEscape.delimiter","title":"tablite.file_reader_utils.TextEscape.delimiter = delimiter instance-attribute ","text":""},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.TextEscape.strip_leading_and_tailing_whitespace","title":"tablite.file_reader_utils.TextEscape.strip_leading_and_tailing_whitespace = strip_leading_and_tailing_whitespace instance-attribute ","text":""},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.TextEscape.qoute","title":"tablite.file_reader_utils.TextEscape.qoute = text_qualifier instance-attribute ","text":""},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.TextEscape.c","title":"tablite.file_reader_utils.TextEscape.c = self._call_1 instance-attribute ","text":""},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.TextEscape-functions","title":"Functions","text":""},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.TextEscape.__call__","title":"tablite.file_reader_utils.TextEscape.__call__(s) ","text":"Source code in tablite/file_reader_utils.py def __call__(self, s):\n    return self.c(s)\n 
"},{"location":"reference/file_reader_utils/#tablite.file_reader_utils-functions","title":"Functions","text":""},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.split_by_sequence","title":"tablite.file_reader_utils.split_by_sequence(text, sequence) ","text":"helper to split text according to a split sequence. Source code in tablite/file_reader_utils.py def split_by_sequence(text, sequence):\n \"\"\"helper to split text according to a split sequence.\"\"\"\n chunks = tuple()\n for element in sequence:\n idx = text.find(element)\n if idx < 0:\n raise ValueError(f\"'{element}' not in row\")\n chunk, text = text[:idx], text[len(element) + idx :]\n chunks += (chunk,)\n chunks += (text,) # the remaining text.\n return chunks\n "},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.detect_seperator","title":"tablite.file_reader_utils.detect_seperator(text) ","text":":param path: pathlib.Path objects :param encoding: file encoding. :return: 1 character. Source code in tablite/file_reader_utils.py def detect_seperator(text):\n \"\"\"\n :param path: pathlib.Path objects\n :param encoding: file encoding.\n :return: 1 character.\n \"\"\"\n # After reviewing the logic in the CSV sniffer, I concluded that all it\n # really does is to look for a non-text character. As the separator is\n # determined by the first line, which almost always is a line of headers,\n # the text characters will be utf-8,16 or ascii letters plus white space.\n # This leaves the characters ,;:| and \\t as potential separators, with one\n # exception: files that use whitespace as separator. My logic is therefore\n # to (1) find the set of characters that intersect with ',;:|\\t' which in\n # practice is a single character, unless (2) it is empty whereby it must\n # be whitespace.\n if len(text) == 0:\n return None\n seps = {\",\", \"\\t\", \";\", \":\", \"|\"}.intersection(text)\n if not seps:\n if \" \" in text:\n return \" \"\n if \"\\n\" in text:\n return \"\\n\"\n else:\n raise ValueError(\"separator not detected\")\n if len(seps) == 1:\n return seps.pop()\n else:\n frq = [(text.count(i), i) for i in seps]\n frq.sort(reverse=True) # most frequent first.\n return frq[0][-1]\n "},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.text_reader_headers","title":"tablite.file_reader_utils.text_reader_headers(path, delimiter, header_row_index, text_qualifier, linecount) ","text":"Source code in tablite/file_reader_utils.py def text_reader_headers(path, delimiter, header_row_index, text_qualifier, linecount):\n d = {}\n delimiters = {\n \".csv\": \",\",\n \".tsv\": \"\\t\",\n \".txt\": None,\n }\n\n try:\n with path.open(\"rb\") as fi:\n rawdata = fi.read(ENCODING_GUESS_BYTES)\n encoding = chardet.detect(rawdata)[\"encoding\"]\n\n if delimiter is None:\n with path.open(\"r\", encoding=encoding, errors=\"ignore\") as fi:\n lines = []\n for n, line in enumerate(fi, -header_row_index):\n if n < 0:\n continue\n line = line.rstrip(\"\\n\")\n lines.append(line)\n if n >= linecount:\n break # break on first\n try:\n d[\"delimiter\"] = delimiter = detect_seperator(\"\\n\".join(lines))\n except ValueError as e:\n if e.args == (\"separator not detected\", ):\n d[\"delimiter\"] = delimiter = None # this will handle the case of 1 column, 1 row\n else:\n raise e\n\n if delimiter is None:\n d[\"delimiter\"] = delimiter = delimiters[path.suffix] # pickup the default one\n d[path.name] = [lines]\n d[\"is_empty\"] = True # mark as empty to return an empty table instead of throwing\n else:\n kwargs = 
{}\n\n if text_qualifier is not None:\n kwargs[\"text_qualifier\"] = text_qualifier\n kwargs[\"quoting\"] = \"QUOTE_MINIMAL\"\n else:\n kwargs[\"quoting\"] = \"QUOTE_NONE\"\n\n d[path.name] = _get_headers(\n str(path), py_to_nim_encoding(encoding), header_row_index=header_row_index,\n delimiter=delimiter,\n linecount=linecount,\n **kwargs\n )\n return d\n except Exception as e:\n raise ValueError(f\"can't read {path.suffix}\")\n "},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.excel_reader_headers","title":"tablite.file_reader_utils.excel_reader_headers(path, delimiter, header_row_index, text_qualifier, linecount) ","text":"Source code in tablite/file_reader_utils.py def excel_reader_headers(path, delimiter, header_row_index, text_qualifier, linecount):\n d = {}\n book = openpyxl.open(str(path), read_only=True)\n\n try:\n all_sheets = book.sheetnames\n\n for sheet_name, sheet in ((name, book[name]) for name in all_sheets):\n fixup_worksheet(sheet)\n if sheet.max_row is None:\n max_rows = 0\n else:\n max_rows = min(sheet.max_row, linecount + 1)\n container = [None] * max_rows\n padding_ends = 0\n max_column = sheet.max_column\n\n for i, row_data in enumerate(sheet.iter_rows(0, header_row_index + max_rows, values_only=True), start=-header_row_index):\n if i < 0:\n # NOTE: for some reason `iter_rows` specifying a start row starts reading cells as binary, instead skip the rows that are before our first read row\n continue\n\n # NOTE: text readers do not cast types and give back strings, neither should xlsx reader, can't find documentation if it's possible to ignore this via `iter_rows` instead of casting back to string\n container[i] = [DataTypes.to_json(v) for v in row_data]\n\n for j, cell in enumerate(reversed(row_data)):\n if cell is None:\n continue\n\n padding_ends = max(padding_ends, max_column - j)\n\n break\n\n d[sheet_name] = [None if c is None else c[0:padding_ends] for c in container]\n d[\"delimiter\"] = None\n finally:\n book.close()\n\n return d\n "},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.ods_reader_headers","title":"tablite.file_reader_utils.ods_reader_headers(path, delimiter, header_row_index, text_qualifier, linecount) ","text":"Source code in tablite/file_reader_utils.py def ods_reader_headers(path, delimiter, header_row_index, text_qualifier, linecount):\n d = {\n \"delimiter\": None\n }\n sheets = pyexcel.get_book_dict(file_name=str(path))\n\n for sheet_name, data in sheets.items():\n lines = [[DataTypes.to_json(v) for v in row] for row in data[header_row_index:header_row_index+linecount]]\n\n d[sheet_name] = lines\n\n return d\n "},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.get_headers","title":"tablite.file_reader_utils.get_headers(path, delimiter=None, header_row_index=0, text_qualifier=None, linecount=10) ","text":"file format definition csv comma separated values tsv tab separated values csvz a zip file that contains one or many csv files tsvz a zip file that contains one or many tsv files xls a spreadsheet file format created by MS-Excel 97-2003 xlsx MS-Excel Extensions to the Office Open XML SpreadsheetML File Format. 
xlsm an MS-Excel Macro-Enabled Workbook file ods open document spreadsheet fods flat open document spreadsheet json JavaScript Object Notation html html table of the data structure simple simple presentation rst reStructuredText presentation of the data mediawiki media wiki table Source code in tablite/file_reader_utils.py def get_headers(path, delimiter=None, header_row_index=0, text_qualifier=None, linecount=10):\n    \"\"\"\n    file format\tdefinition\n    csv\t comma separated values\n    tsv\t tab separated values\n    csvz\ta zip file that contains one or many csv files\n    tsvz\ta zip file that contains one or many tsv files\n    xls\t a spreadsheet file format created by MS-Excel 97-2003\n    xlsx\tMS-Excel Extensions to the Office Open XML SpreadsheetML File Format.\n    xlsm\tan MS-Excel Macro-Enabled Workbook file\n    ods\t open document spreadsheet\n    fods\tflat open document spreadsheet\n    json\tJavaScript Object Notation\n    html\thtml table of the data structure\n    simple\tsimple presentation\n    rst\t reStructuredText presentation of the data\n    mediawiki\tmedia wiki table\n    \"\"\"\n    if isinstance(path, str):\n        path = Path(path)\n    if not isinstance(path, Path):\n        raise TypeError(\"expected pathlib path.\")\n    if not path.exists():\n        raise FileNotFoundError(str(path))\n    if delimiter is not None:\n        if not isinstance(delimiter, str):\n            raise TypeError(f\"expected str or None, not {type(delimiter)}\")\n\n    kwargs = {\n        \"path\": path,\n        \"delimiter\": delimiter,\n        \"header_row_index\": header_row_index,\n        \"text_qualifier\": text_qualifier,\n        \"linecount\": linecount\n    }\n\n    reader = header_readers.get(path.suffix[1:], None)\n\n    if reader is None:\n        raise TypeError(f\"file format for headers not supported: {path.suffix}\")\n\n    result = reader(**kwargs)\n\n    return result\n "},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.get_encoding","title":"tablite.file_reader_utils.get_encoding(path, nbytes=ENCODING_GUESS_BYTES) ","text":"Source code in tablite/file_reader_utils.py def get_encoding(path, nbytes=ENCODING_GUESS_BYTES):\n    nbytes = min(nbytes, path.stat().st_size)\n    with path.open(\"rb\") as fi:\n        rawdata = fi.read(nbytes)\n        encoding = chardet.detect(rawdata)[\"encoding\"]\n    if encoding == \"ascii\":  # utf-8 is backwards compatible with ascii\n        return \"utf-8\"  # -- so should the first 10k chars not be enough,\n    return encoding  # -- the utf-8 encoding will still get it right.\n "},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.get_delimiter","title":"tablite.file_reader_utils.get_delimiter(path, encoding) ","text":"Source code in tablite/file_reader_utils.py def get_delimiter(path, encoding):\n    with path.open(\"r\", encoding=encoding, errors=\"ignore\") as fi:\n        lines = []\n        for n, line in enumerate(fi):\n            line = line.rstrip(\"\\n\")\n            lines.append(line)\n            if n > 10:\n                break  # 10 lines is enough to detect the delimiter\n    delimiter = detect_seperator(\"\\n\".join(lines))\n    if delimiter is None:\n        raise ValueError(\"Delimiter could not be determined\")\n    return delimiter\n "},{"location":"reference/groupby_utils/","title":"Groupby utils","text":""},{"location":"reference/groupby_utils/#tablite.groupby_utils","title":"tablite.groupby_utils ","text":""},{"location":"reference/groupby_utils/#tablite.groupby_utils-classes","title":"Classes","text":""},{"location":"reference/groupby_utils/#tablite.groupby_utils.GroupBy","title":"tablite.groupby_utils.GroupBy ","text":" Bases: object 
"},{"location":"reference/groupby_utils/#tablite.groupby_utils.GroupBy-attributes","title":"Attributes","text":""},{"location":"reference/groupby_utils/#tablite.groupby_utils.GroupBy.max","title":"tablite.groupby_utils.GroupBy.max = 'Max' class-attribute instance-attribute ","text":""},{"location":"reference/groupby_utils/#tablite.groupby_utils.GroupBy.min","title":"tablite.groupby_utils.GroupBy.min = 'Min' class-attribute instance-attribute ","text":""},{"location":"reference/groupby_utils/#tablite.groupby_utils.GroupBy.sum","title":"tablite.groupby_utils.GroupBy.sum = 'Sum' class-attribute instance-attribute ","text":""},{"location":"reference/groupby_utils/#tablite.groupby_utils.GroupBy.product","title":"tablite.groupby_utils.GroupBy.product = 'Product' class-attribute instance-attribute ","text":""},{"location":"reference/groupby_utils/#tablite.groupby_utils.GroupBy.first","title":"tablite.groupby_utils.GroupBy.first = 'First' class-attribute instance-attribute ","text":""},{"location":"reference/groupby_utils/#tablite.groupby_utils.GroupBy.last","title":"tablite.groupby_utils.GroupBy.last = 'Last' class-attribute instance-attribute ","text":""},{"location":"reference/groupby_utils/#tablite.groupby_utils.GroupBy.count","title":"tablite.groupby_utils.GroupBy.count = 'Count' class-attribute instance-attribute ","text":""},{"location":"reference/groupby_utils/#tablite.groupby_utils.GroupBy.count_unique","title":"tablite.groupby_utils.GroupBy.count_unique = 'CountUnique' class-attribute instance-attribute ","text":""},{"location":"reference/groupby_utils/#tablite.groupby_utils.GroupBy.avg","title":"tablite.groupby_utils.GroupBy.avg = 'Average' class-attribute instance-attribute ","text":""},{"location":"reference/groupby_utils/#tablite.groupby_utils.GroupBy.stdev","title":"tablite.groupby_utils.GroupBy.stdev = 'StandardDeviation' class-attribute instance-attribute ","text":""},{"location":"reference/groupby_utils/#tablite.groupby_utils.GroupBy.median","title":"tablite.groupby_utils.GroupBy.median = 'Median' class-attribute instance-attribute ","text":""},{"location":"reference/groupby_utils/#tablite.groupby_utils.GroupBy.mode","title":"tablite.groupby_utils.GroupBy.mode = 'Mode' class-attribute instance-attribute ","text":""},{"location":"reference/import_utils/","title":"Import utils","text":""},{"location":"reference/import_utils/#tablite.import_utils","title":"tablite.import_utils ","text":""},{"location":"reference/import_utils/#tablite.import_utils-attributes","title":"Attributes","text":""},{"location":"reference/import_utils/#tablite.import_utils.file_readers","title":"tablite.import_utils.file_readers = {'fods': excel_reader, 'json': excel_reader, 'html': from_html, 'hdf5': from_hdf5, 'simple': excel_reader, 'rst': excel_reader, 'mediawiki': excel_reader, 'xlsx': excel_reader, 'xls': excel_reader, 'xlsm': excel_reader, 'csv': text_reader, 'tsv': text_reader, 'txt': text_reader, 'ods': ods_reader} module-attribute ","text":""},{"location":"reference/import_utils/#tablite.import_utils.valid_readers","title":"tablite.import_utils.valid_readers = ','.join(list(file_readers.keys())) module-attribute ","text":""},{"location":"reference/import_utils/#tablite.import_utils-classes","title":"Classes","text":""},{"location":"reference/import_utils/#tablite.import_utils.TRconfig","title":"tablite.import_utils.TRconfig(source, destination, start, end, guess_datatypes, delimiter, text_qualifier, text_escape_openings, text_escape_closures, strip_leading_and_tailing_whitespace, encoding, 
newline_offsets, fields) ","text":" Bases: object Source code in tablite/import_utils.py def __init__(\n self,\n source,\n destination,\n start,\n end,\n guess_datatypes,\n delimiter,\n text_qualifier,\n text_escape_openings,\n text_escape_closures,\n strip_leading_and_tailing_whitespace,\n encoding,\n newline_offsets,\n fields\n) -> None:\n self.source = source\n self.destination = destination\n self.start = start\n self.end = end\n self.guess_datatypes = guess_datatypes\n self.delimiter = delimiter\n self.text_qualifier = text_qualifier\n self.text_escape_openings = text_escape_openings\n self.text_escape_closures = text_escape_closures\n self.strip_leading_and_tailing_whitespace = strip_leading_and_tailing_whitespace\n self.encoding = encoding\n self.newline_offsets = newline_offsets\n self.fields = fields\n type_check(start, int),\n type_check(end, int),\n type_check(delimiter, str),\n type_check(text_qualifier, (str, type(None))),\n type_check(text_escape_openings, str),\n type_check(text_escape_closures, str),\n type_check(encoding, str),\n type_check(strip_leading_and_tailing_whitespace, bool),\n type_check(newline_offsets, list)\n type_check(fields, dict)\n "},{"location":"reference/import_utils/#tablite.import_utils.TRconfig-attributes","title":"Attributes","text":""},{"location":"reference/import_utils/#tablite.import_utils.TRconfig.source","title":"tablite.import_utils.TRconfig.source = source instance-attribute ","text":""},{"location":"reference/import_utils/#tablite.import_utils.TRconfig.destination","title":"tablite.import_utils.TRconfig.destination = destination instance-attribute ","text":""},{"location":"reference/import_utils/#tablite.import_utils.TRconfig.start","title":"tablite.import_utils.TRconfig.start = start instance-attribute ","text":""},{"location":"reference/import_utils/#tablite.import_utils.TRconfig.end","title":"tablite.import_utils.TRconfig.end = end instance-attribute ","text":""},{"location":"reference/import_utils/#tablite.import_utils.TRconfig.guess_datatypes","title":"tablite.import_utils.TRconfig.guess_datatypes = guess_datatypes instance-attribute ","text":""},{"location":"reference/import_utils/#tablite.import_utils.TRconfig.delimiter","title":"tablite.import_utils.TRconfig.delimiter = delimiter instance-attribute ","text":""},{"location":"reference/import_utils/#tablite.import_utils.TRconfig.text_qualifier","title":"tablite.import_utils.TRconfig.text_qualifier = text_qualifier instance-attribute ","text":""},{"location":"reference/import_utils/#tablite.import_utils.TRconfig.text_escape_openings","title":"tablite.import_utils.TRconfig.text_escape_openings = text_escape_openings instance-attribute ","text":""},{"location":"reference/import_utils/#tablite.import_utils.TRconfig.text_escape_closures","title":"tablite.import_utils.TRconfig.text_escape_closures = text_escape_closures instance-attribute ","text":""},{"location":"reference/import_utils/#tablite.import_utils.TRconfig.strip_leading_and_tailing_whitespace","title":"tablite.import_utils.TRconfig.strip_leading_and_tailing_whitespace = strip_leading_and_tailing_whitespace instance-attribute ","text":""},{"location":"reference/import_utils/#tablite.import_utils.TRconfig.encoding","title":"tablite.import_utils.TRconfig.encoding = encoding instance-attribute ","text":""},{"location":"reference/import_utils/#tablite.import_utils.TRconfig.newline_offsets","title":"tablite.import_utils.TRconfig.newline_offsets = newline_offsets instance-attribute 
","text":""},{"location":"reference/import_utils/#tablite.import_utils.TRconfig.fields","title":"tablite.import_utils.TRconfig.fields = fields instance-attribute ","text":""},{"location":"reference/import_utils/#tablite.import_utils.TRconfig-functions","title":"Functions","text":""},{"location":"reference/import_utils/#tablite.import_utils.TRconfig.copy","title":"tablite.import_utils.TRconfig.copy() ","text":"Source code in tablite/import_utils.py def copy(self):\n return TRconfig(**self.dict())\n "},{"location":"reference/import_utils/#tablite.import_utils.TRconfig.dict","title":"tablite.import_utils.TRconfig.dict() ","text":"Source code in tablite/import_utils.py def dict(self):\n return {k: v for k, v in self.__dict__.items() if not (k.startswith(\"_\") or callable(v))}\n "},{"location":"reference/import_utils/#tablite.import_utils-functions","title":"Functions","text":""},{"location":"reference/import_utils/#tablite.import_utils.from_pandas","title":"tablite.import_utils.from_pandas(T, df) ","text":"Creates Table using pd.to_dict('list') similar to: import pandas as pd df = pd.DataFrame({'a':[1,2,3], 'b':[4,5,6]}) df a b 0 1 4 1 2 5 2 3 6 df.to_dict('list') t = Table.from_dict(df.to_dict('list)) t.show() +===+===+===+ | # | a | b | |row|int|int| +---+---+---+ | 0 | 1| 4| | 1 | 2| 5| | 2 | 3| 6| +===+===+===+ Source code in tablite/import_utils.py def from_pandas(T, df):\n \"\"\"\n Creates Table using pd.to_dict('list')\n\n similar to:\n >>> import pandas as pd\n >>> df = pd.DataFrame({'a':[1,2,3], 'b':[4,5,6]})\n >>> df\n a b\n 0 1 4\n 1 2 5\n 2 3 6\n >>> df.to_dict('list')\n {'a': [1, 2, 3], 'b': [4, 5, 6]}\n\n >>> t = Table.from_dict(df.to_dict('list))\n >>> t.show()\n +===+===+===+\n | # | a | b |\n |row|int|int|\n +---+---+---+\n | 0 | 1| 4|\n | 1 | 2| 5|\n | 2 | 3| 6|\n +===+===+===+\n \"\"\"\n if not issubclass(T, BaseTable):\n raise TypeError(\"Expected subclass of Table\")\n\n return T(columns=df.to_dict(\"list\")) # noqa\n "},{"location":"reference/import_utils/#tablite.import_utils.from_hdf5","title":"tablite.import_utils.from_hdf5(T, path, tqdm=_tqdm, pbar=None) ","text":"imports an exported hdf5 table. 
Note that some loss of type information is to be expected in columns of mixed type: t.show(dtype=True) +===+===+=====+=====+====+=====+=====+===================+==========+========+===============+===+=========================+=====+===+ | # | A | B | C | D | E | F | G | H | I | J | K | L | M | O | |row|int|mixed|float|str |mixed| bool| datetime | date | time | timedelta |str| int |float|int| +---+---+-----+-----+----+-----+-----+-------------------+----------+--------+---------------+---+-------------------------+-----+---+ | 0 | -1|None | -1.1| |None |False|2023-06-09 09:12:06|2023-06-09|09:12:06| 1 day, 0:00:00|b |-100000000000000000000000| inf| 11| | 1 | 1| 1| 1.1|1000|1 | True|2023-06-09 09:12:06|2023-06-09|09:12:06|2 days, 0:06:40|\u55e8 | 100000000000000000000000| -inf|-11| +===+===+=====+=====+====+=====+=====+===================+==========+========+===============+===+=========================+=====+===+ t.to_hdf5(filename) t2 = Table.from_hdf5(filename) t2.show(dtype=True) +===+===+=====+=====+=====+=====+=====+===================+===================+========+===============+===+=========================+=====+===+ | # | A | B | C | D | E | F | G | H | I | J | K | L | M | O | |row|int|mixed|float|mixed|mixed| bool| datetime | datetime | time | str |str| int |float|int| +---+---+-----+-----+-----+-----+-----+-------------------+-------------------+--------+---------------+---+-------------------------+-----+---+ | 0 | -1|None | -1.1|None |None |False|2023-06-09 09:12:06|2023-06-09 00:00:00|09:12:06|1 day, 0:00:00 |b |-100000000000000000000000| inf| 11| | 1 | 1| 1| 1.1| 1000| 1| True|2023-06-09 09:12:06|2023-06-09 00:00:00|09:12:06|2 days, 0:06:40|\u55e8 | 100000000000000000000000| -inf|-11| +===+===+=====+=====+=====+=====+=====+===================+===================+========+===============+===+=========================+=====+===+ Source code in tablite/import_utils.py def from_hdf5(T, path, tqdm=_tqdm, pbar=None):\n \"\"\"\n imports an exported hdf5 table.\n\n Note that some loss of type information is to be expected in columns of mixed type:\n >>> t.show(dtype=True)\n +===+===+=====+=====+====+=====+=====+===================+==========+========+===============+===+=========================+=====+===+\n | # | A | B | C | D | E | F | G | H | I | J | K | L | M | O |\n |row|int|mixed|float|str |mixed| bool| datetime | date | time | timedelta |str| int |float|int|\n +---+---+-----+-----+----+-----+-----+-------------------+----------+--------+---------------+---+-------------------------+-----+---+\n | 0 | -1|None | -1.1| |None |False|2023-06-09 09:12:06|2023-06-09|09:12:06| 1 day, 0:00:00|b |-100000000000000000000000| inf| 11|\n | 1 | 1| 1| 1.1|1000|1 | True|2023-06-09 09:12:06|2023-06-09|09:12:06|2 days, 0:06:40|\u55e8 | 100000000000000000000000| -inf|-11|\n +===+===+=====+=====+====+=====+=====+===================+==========+========+===============+===+=========================+=====+===+\n >>> t.to_hdf5(filename)\n >>> t2 = Table.from_hdf5(filename)\n >>> t2.show(dtype=True)\n +===+===+=====+=====+=====+=====+=====+===================+===================+========+===============+===+=========================+=====+===+\n | # | A | B | C | D | E | F | G | H | I | J | K | L | M | O |\n |row|int|mixed|float|mixed|mixed| bool| datetime | datetime | time | str |str| int |float|int|\n +---+---+-----+-----+-----+-----+-----+-------------------+-------------------+--------+---------------+---+-------------------------+-----+---+\n | 0 | -1|None | -1.1|None |None |False|2023-06-09 
09:12:06|2023-06-09 00:00:00|09:12:06|1 day, 0:00:00 |b |-100000000000000000000000| inf| 11|\n | 1 | 1| 1| 1.1| 1000| 1| True|2023-06-09 09:12:06|2023-06-09 00:00:00|09:12:06|2 days, 0:06:40|\u55e8 | 100000000000000000000000| -inf|-11|\n +===+===+=====+=====+=====+=====+=====+===================+===================+========+===============+===+=========================+=====+===+\n \"\"\"\n if not issubclass(T, BaseTable):\n raise TypeError(\"Expected subclass of Table\")\n import h5py\n\n type_check(path, Path)\n t = T()\n with h5py.File(path, \"r\") as h5:\n for col_name in h5.keys():\n dset = h5[col_name]\n arr = np.array(dset[:])\n if arr.dtype == object:\n arr = np.array(DataTypes.guess([v.decode(\"utf-8\") for v in arr]))\n t[col_name] = arr\n return t\n "},{"location":"reference/import_utils/#tablite.import_utils.from_json","title":"tablite.import_utils.from_json(T, jsn) ","text":"Imports tables exported using .to_json Source code in tablite/import_utils.py def from_json(T, jsn):\n \"\"\"\n Imports tables exported using .to_json\n \"\"\"\n if not issubclass(T, BaseTable):\n raise TypeError(\"Expected subclass of Table\")\n import json\n\n type_check(jsn, str)\n d = json.loads(jsn)\n return T(columns=d[\"columns\"])\n "},{"location":"reference/import_utils/#tablite.import_utils.from_html","title":"tablite.import_utils.from_html(T, path, tqdm=_tqdm, pbar=None) ","text":"Source code in tablite/import_utils.py def from_html(T, path, tqdm=_tqdm, pbar=None):\n if not issubclass(T, BaseTable):\n raise TypeError(\"Expected subclass of Table\")\n type_check(path, Path)\n\n if pbar is None:\n total = path.stat().st_size\n pbar = tqdm(total=total, desc=\"from_html\", disable=Config.TQDM_DISABLE)\n\n row_start, row_end = \"<tr>\", \"</tr>\"\n value_start, value_end = \"<th>\", \"</th>\"\n chunk = \"\"\n t = None # will be T()\n start, end = 0, 0\n data = {}\n with path.open(\"r\") as fi:\n while True:\n start = chunk.find(row_start, start) # row tag start\n end = chunk.find(row_end, end) # row tag end\n if start == -1 or end == -1:\n new = fi.read(100_000)\n pbar.update(len(new))\n if new == \"\":\n break\n chunk += new\n continue\n # get indices from chunk\n row = chunk[start + len(row_start) : end]\n fields = [v.rstrip(value_end) for v in row.split(value_start)]\n if not data:\n headers = fields[:]\n data = {f: [] for f in headers}\n continue\n else:\n for field, header in zip(fields, headers):\n data[header].append(field)\n\n chunk = chunk[end + len(row_end) :]\n\n if len(data[headers[0]]) == Config.PAGE_SIZE:\n if t is None:\n t = T(columns=data)\n else:\n for k, v in data.items():\n t[k].extend(DataTypes.guess(v))\n data = {f: [] for f in headers}\n\n for k, v in data.items():\n t[k].extend(DataTypes.guess(v))\n return t\n "},{"location":"reference/import_utils/#tablite.import_utils.excel_reader","title":"tablite.import_utils.excel_reader(T, path, first_row_has_headers=True, header_row_index=0, sheet=None, columns=None, skip_empty='NONE', start=0, limit=sys.maxsize, tqdm=_tqdm, **kwargs) ","text":"returns Table from excel **kwargs are excess arguments that are ignored. 
Source code in tablite/import_utils.py def excel_reader(T, path, first_row_has_headers=True, header_row_index=0, sheet=None, columns=None, skip_empty=\"NONE\", start=0, limit=sys.maxsize, tqdm=_tqdm, **kwargs):\n \"\"\"\n returns Table from excel\n\n **kwargs are excess arguments that are ignored.\n \"\"\"\n if not issubclass(T, BaseTable):\n raise TypeError(\"Expected subclass of Table\")\n\n book = openpyxl.load_workbook(path, read_only=True, data_only=True)\n\n if sheet is None: # help the user.\n \"\"\"\n If no sheet specified, assume first sheet.\n\n Reasoning:\n Pandas ODS reader does that, so this preserves parity and it might be expected by users.\n If we don't know the sheet name but only have single sheet,\n we would need to take extra steps to find out the name of the sheet.\n We already make assumptions in case of column selection,\n when columns are None, we import all of them.\n \"\"\"\n sheet = book.sheetnames[0]\n elif sheet not in book.sheetnames:\n raise ValueError(f\"sheet not found: {sheet}\")\n\n if not (isinstance(start, int) and start >= 0):\n raise ValueError(\"expected start as an integer >=0\")\n if not (isinstance(limit, int) and limit > 0):\n raise ValueError(\"expected limit as integer > 0\")\n\n worksheet = book[sheet]\n fixup_worksheet(worksheet)\n\n try:\n it_header = worksheet.iter_rows(min_row=header_row_index + 1)\n while True:\n # get the first row to know our headers or the number of columns\n row = [c.value for c in next(it_header)]\n break\n fields = [str(c) if c is not None else \"\" for c in row] # excel is offset by 1\n except StopIteration:\n # excel was empty, return empty table\n return T()\n\n if not first_row_has_headers:\n # since the first row did not contain headers, we use the column count to populate header names\n fields = [str(i) for i in range(len(fields))]\n\n if columns is None:\n # no columns were specified by user to import, that means we import all of the them\n columns = []\n\n for f in fields:\n # fixup the duplicate column names\n columns.append(unique_name(f, columns))\n\n field_dict = {k: i for i, k in enumerate(columns)}\n else:\n field_dict = {}\n\n for k, i in ((k, fields.index(k)) for k in columns):\n # fixup the duplicate column names\n field_dict[unique_name(k, field_dict.keys())] = i\n\n # calculate our data rows iterator offset\n it_offset = start + (1 if first_row_has_headers else 0) + header_row_index + 1\n\n # attempt to fetch number of rows in the sheet\n total_rows = worksheet.max_row\n real_tqdm = True\n\n if total_rows is None:\n # i don't know what causes it but max_row can be None in some cases, so we don't know how large the dataset is\n total_rows = it_offset + limit\n real_tqdm = False\n\n # create the actual data rows iterator\n it_rows = worksheet.iter_rows(min_row=it_offset, max_row=min(it_offset+limit, total_rows))\n it_used_indices = list(field_dict.values())\n\n # filter columns that we're not going to use\n it_rows_filtered = ([row[idx].value for idx in it_used_indices] for row in it_rows)\n\n # create page directory\n workdir = Path(Config.workdir) / Config.pid\n pagesdir = workdir/\"pages\"\n pagesdir.mkdir(exist_ok=True, parents=True)\n\n field_names = list(field_dict.keys())\n column_count = len(field_names)\n\n page_fhs = None\n\n # prepopulate the table with columns\n table = T()\n for name in field_names:\n table[name] = Column(table.path)\n\n pbar_fname = path.name\n if len(pbar_fname) > 20:\n pbar_fname = pbar_fname[0:10] + \"...\" + pbar_fname[-7:]\n\n if real_tqdm:\n # we can create a 
true tqdm progress bar, make one\n tqdm_iter = tqdm(it_rows_filtered, total=total_rows, desc=f\"importing excel: {pbar_fname}\")\n else:\n \"\"\"\n openpyxls was unable to precalculate the size of the excel for whatever reason\n forcing recalc would require parsing entire file\n drop the progress bar in that case, just show iterations\n\n as an alternative we can use \u03a3=1/x but it just doesn't look good, show iterations per second instead\n \"\"\"\n tqdm_iter = tqdm(it_rows_filtered, desc=f\"importing excel: {pbar_fname}\")\n\n tqdm_iter = iter(tqdm_iter)\n\n idx = 0\n\n while True:\n try:\n row = next(tqdm_iter)\n except StopIteration:\n break # because in some cases we can't know the size of excel to set the upper iterator limit we loop until stop iteration is encountered\n\n if skip_empty == \"ALL\" and all(v is None for v in row):\n continue\n elif skip_empty == \"ANY\" and any(v is None for v in row):\n continue\n\n if idx % Config.PAGE_SIZE == 0:\n if page_fhs is not None:\n # we reached the max page file size, fix the pages\n [_fix_xls_page(table, c, fh) for c, fh in zip(field_names, page_fhs)]\n\n page_fhs = [None] * column_count\n\n for cidx in range(column_count):\n # allocate new pages\n pg_path = pagesdir / f\"{next(Page.ids)}.npy\"\n page_fhs[cidx] = open(pg_path, \"wb\")\n\n for fh, value in zip(page_fhs, row):\n \"\"\"\n since excel types are already cast into appropriate type we're going to do two passes per page\n\n we create our temporary custom format:\n packed type|packed byte count|packed bytes|...\n\n available types:\n * q - int64\n * d - float64\n * s - string\n * b - boolean\n * n - none\n * p - pickled (date, time, datetime)\n \"\"\"\n dtype = type(value)\n\n if dtype == int:\n ptype, bytes_ = b'q', struct.pack('q', value) # pack int as int64\n elif dtype == float:\n ptype, bytes_ = b'd', struct.pack('d', value) # pack float as float64\n elif dtype == str:\n ptype, bytes_ = b's', value.encode(\"utf-8\") # pack string\n elif dtype == bool:\n ptype, bytes_ = b'b', b'1' if value else b'0' # pack boolean\n elif value is None:\n ptype, bytes_ = b'n', b'' # pack none\n elif dtype in [date, time, datetime]:\n ptype, bytes_ = b'p', pkl.dumps(value) # pack object types via pickle\n else:\n raise NotImplementedError()\n\n byte_count = struct.pack('I', len(bytes_)) # pack our payload size, i doubt payload size can be over uint32\n\n # dump object to file\n fh.write(ptype)\n fh.write(byte_count)\n fh.write(bytes_)\n\n idx = idx + 1\n\n if page_fhs is not None:\n # we reached end of the loop, fix the pages\n [_fix_xls_page(table, c, fh) for c, fh in zip(field_names, page_fhs)]\n\n return table\n "},{"location":"reference/import_utils/#tablite.import_utils.ods_reader","title":"tablite.import_utils.ods_reader(T, path, first_row_has_headers=True, header_row_index=0, sheet=None, columns=None, skip_empty='NONE', start=0, limit=sys.maxsize, **kwargs) ","text":"returns Table from .ODS Source code in tablite/import_utils.py def ods_reader(T, path, first_row_has_headers=True, header_row_index=0, sheet=None, columns=None, skip_empty=\"NONE\", start=0, limit=sys.maxsize, **kwargs):\n \"\"\"\n returns Table from .ODS\n \"\"\"\n if not issubclass(T, BaseTable):\n raise TypeError(\"Expected subclass of Table\")\n\n if sheet is None:\n data = read_excel(str(path), header=None) # selects first sheet\n else:\n data = read_excel(str(path), sheet_name=sheet, header=None)\n\n data[isna(data)] = None # convert any empty cells to None\n data = data.to_numpy().tolist() # convert pandas to 
list\n\n if skip_empty == \"ALL\" or skip_empty == \"ANY\":\n \"\"\" filter out all rows based on predicate that come after header row \"\"\"\n fn_filter = any if skip_empty == \"ALL\" else all # this is intentional\n data = [\n row\n for ridx, row in enumerate(data)\n if ridx < header_row_index + (1 if first_row_has_headers else 0) or fn_filter(not (v is None or isinstance(v, str) and len(v) == 0) for v in row)\n ]\n\n data = np.array(data, dtype=np.object_) # cast back to numpy array for slicing but don't try to convert datatypes\n\n if not (isinstance(start, int) and start >= 0):\n raise ValueError(\"expected start as an integer >=0\")\n if not (isinstance(limit, int) and limit > 0):\n raise ValueError(\"expected limit as integer > 0\")\n\n t = T()\n\n used_columns_names = set()\n for ix, value in enumerate(data[header_row_index]):\n if first_row_has_headers:\n header, start_row_pos = \"\" if value is None else str(value), (1 + header_row_index)\n else:\n header, start_row_pos = f\"_{ix + 1}\", (0 + header_row_index)\n\n if columns is not None:\n if header not in columns:\n continue\n\n unique_column_name = unique_name(str(header), used_columns_names)\n used_columns_names.add(unique_column_name)\n\n column_values = data[start_row_pos : start_row_pos + limit, ix]\n\n t[unique_column_name] = column_values\n return t\n "},{"location":"reference/import_utils/#tablite.import_utils.text_reader_task","title":"tablite.import_utils.text_reader_task(source, destination, start, end, guess_datatypes, delimiter, text_qualifier, text_escape_openings, text_escape_closures, strip_leading_and_tailing_whitespace, encoding, newline_offsets, fields) ","text":"PARALLEL TASK FUNCTION reads columnsname + path[start:limit] into hdf5. source: csv or txt file destination: filename for page. start: int: start of page. end: int: end of page. 
guess_datatypes: bool: if True datatypes will be inferred by datatypes.Datatypes.guess delimiter: ',' ';' or '|' text_qualifier: str: commonly \" text_escape_openings: str: default: \"({[ text_escape_closures: str: default: ]})\" strip_leading_and_tailing_whitespace: bool encoding: chardet encoding ('utf-8, 'ascii', ..., 'ISO-22022-CN') Source code in tablite/import_utils.py def text_reader_task(\n source,\n destination,\n start,\n end,\n guess_datatypes,\n delimiter,\n text_qualifier,\n text_escape_openings,\n text_escape_closures,\n strip_leading_and_tailing_whitespace,\n encoding,\n newline_offsets,\n fields\n):\n \"\"\"PARALLEL TASK FUNCTION\n reads columnsname + path[start:limit] into hdf5.\n\n source: csv or txt file\n destination: filename for page.\n start: int: start of page.\n end: int: end of page.\n guess_datatypes: bool: if True datatypes will be inferred by datatypes.Datatypes.guess\n delimiter: ',' ';' or '|'\n text_qualifier: str: commonly \\\"\n text_escape_openings: str: default: \"({[\n text_escape_closures: str: default: ]})\"\n strip_leading_and_tailing_whitespace: bool\n encoding: chardet encoding ('utf-8, 'ascii', ..., 'ISO-22022-CN')\n \"\"\"\n if isinstance(source, str):\n source = Path(source)\n type_check(source, Path)\n if not source.exists():\n raise FileNotFoundError(f\"File not found: {source}\")\n type_check(destination, list)\n\n # declare CSV dialect.\n delim = delimiter\n\n class Dialect(csv.Dialect):\n delimiter = delim\n quotechar = '\"' if text_qualifier is None else text_qualifier\n escapechar = '\\\\'\n doublequote = True\n quoting = csv.QUOTE_MINIMAL\n skipinitialspace = False if strip_leading_and_tailing_whitespace is None else strip_leading_and_tailing_whitespace\n lineterminator = \"\\n\"\n\n with source.open(\"r\", encoding=encoding, errors=\"ignore\") as fi: # --READ\n fi.seek(newline_offsets[start])\n reader = csv.reader(fi, dialect=Dialect)\n\n # if there's an issue with file handlers on windows, we can make a special case for windows where the file is opened on demand and appended instead of opening all handlers at once\n page_file_handlers = [open(f, mode=\"wb\") for f in destination]\n\n # identify longest str\n longest_str = [1 for _ in range(len(destination))]\n for row in (next(reader) for _ in range(end - start)):\n for idx, c in ((fields[idx], c) for idx, c in filter(lambda t: t[0] in fields, enumerate(row))):\n longest_str[idx] = max(longest_str[idx], len(c))\n\n column_formats = [f\"<U{i}\" for i in longest_str]\n for idx, cf in enumerate(column_formats):\n _create_numpy_header(cf, (end - start, ), page_file_handlers[idx])\n\n # write page arrays to files\n fi.seek(newline_offsets[start])\n for row in (next(reader) for _ in range(end - start)):\n for idx, c in ((fields[idx], c) for idx, c in filter(lambda t: t[0] in fields, enumerate(row))):\n cbytes = np.asarray(c, dtype=column_formats[idx]).tobytes()\n page_file_handlers[idx].write(cbytes)\n\n [phf.close() for phf in page_file_handlers]\n "},{"location":"reference/import_utils/#tablite.import_utils.text_reader","title":"tablite.import_utils.text_reader(T, path, columns, first_row_has_headers, header_row_index, encoding, start, limit, newline, guess_datatypes, text_qualifier, strip_leading_and_tailing_whitespace, skip_empty, delimiter, text_escape_openings, text_escape_closures, tqdm=_tqdm, **kwargs) ","text":"Source code in tablite/import_utils.py def text_reader(\n T,\n path,\n columns,\n first_row_has_headers,\n header_row_index,\n encoding,\n start,\n limit,\n newline,\n 
guess_datatypes,\n text_qualifier,\n strip_leading_and_tailing_whitespace,\n skip_empty,\n delimiter,\n text_escape_openings,\n text_escape_closures,\n tqdm=_tqdm,\n **kwargs,\n):\n if encoding is None:\n encoding = get_encoding(path, nbytes=ENCODING_GUESS_BYTES)\n\n enc = py_to_nim_encoding(encoding)\n pid = Config.workdir / Config.pid\n kwargs = {}\n\n if first_row_has_headers is not None:\n kwargs[\"first_row_has_headers\"] = first_row_has_headers\n if header_row_index is not None:\n kwargs[\"header_row_index\"] = header_row_index\n if columns is not None:\n kwargs[\"columns\"] = columns\n if start is not None:\n kwargs[\"start\"] = start\n if limit is not None and limit != sys.maxsize:\n kwargs[\"limit\"] = limit\n if guess_datatypes is not None:\n kwargs[\"guess_datatypes\"] = guess_datatypes\n if newline is not None:\n kwargs[\"newline\"] = newline\n if delimiter is not None:\n kwargs[\"delimiter\"] = delimiter\n if text_qualifier is not None:\n kwargs[\"text_qualifier\"] = text_qualifier\n kwargs[\"quoting\"] = \"QUOTE_MINIMAL\"\n else:\n kwargs[\"quoting\"] = \"QUOTE_NONE\"\n if strip_leading_and_tailing_whitespace is not None:\n kwargs[\"strip_leading_and_tailing_whitespace\"] = strip_leading_and_tailing_whitespace\n\n if skip_empty is None:\n kwargs[\"skip_empty\"] = \"NONE\"\n else:\n kwargs[\"skip_empty\"] = skip_empty\n\n return nimlite.text_reader(\n T, pid, path, enc,\n **kwargs,\n tqdm=tqdm\n )\n "},{"location":"reference/import_utils/#tablite.import_utils-modules","title":"Modules","text":""},{"location":"reference/imputation/","title":"Imputation","text":""},{"location":"reference/imputation/#tablite.imputation","title":"tablite.imputation ","text":""},{"location":"reference/imputation/#tablite.imputation-classes","title":"Classes","text":""},{"location":"reference/imputation/#tablite.imputation-functions","title":"Functions","text":""},{"location":"reference/imputation/#tablite.imputation.imputation","title":"tablite.imputation.imputation(T, targets, missing=None, method='carry forward', sources=None, tqdm=_tqdm, pbar=None) ","text":"In statistics, imputation is the process of replacing missing data with substituted values. See more: https://en.wikipedia.org/wiki/Imputation_(statistics) PARAMETER DESCRIPTION table source table. TYPE: Table targets column names to find and replace missing values TYPE: str or list of strings missing values to be replaced. TYPE: None or iterable DEFAULT: None method method to be used for replacement. Options: 'carry forward': takes the previous value, and carries forward into fields where values are missing. +: quick. Realistic on time series. -: Can produce strange outliers. 'mean': calculates the column mean (exclude missing ) and copies the mean in as replacement. +: quick -: doesn't work on text. Causes data set to drift towards the mean. 'mode': calculates the column mode (exclude missing ) and copies the mean in as replacement. +: quick -: most frequent value becomes over-represented in the sample 'nearest neighbour': calculates normalised distance between items in source columns selects nearest neighbour and copies value as replacement. +: works for any datatype. -: computationally intensive (e.g. slow) TYPE: str DEFAULT: 'carry forward' sources NEAREST NEIGHBOUR ONLY column names to be used during imputation. if None or empty, all columns will be used. TYPE: list of strings DEFAULT: None RETURNS DESCRIPTION table table with replaced values. 
Source code in tablite/imputation.py def imputation(T, targets, missing=None, method=\"carry forward\", sources=None, tqdm=_tqdm, pbar=None):\n \"\"\"\n In statistics, imputation is the process of replacing missing data with substituted values.\n\n See more: https://en.wikipedia.org/wiki/Imputation_(statistics)\n\n Args:\n table (Table): source table.\n\n targets (str or list of strings): column names to find and\n replace missing values\n\n missing (None or iterable): values to be replaced.\n\n method (str): method to be used for replacement. Options:\n\n 'carry forward':\n takes the previous value, and carries forward into fields\n where values are missing.\n +: quick. Realistic on time series.\n -: Can produce strange outliers.\n\n 'mean':\n calculates the column mean (exclude `missing`) and copies\n the mean in as replacement.\n +: quick\n -: doesn't work on text. Causes data set to drift towards the mean.\n\n 'mode':\n calculates the column mode (exclude `missing`) and copies\n the mean in as replacement.\n +: quick\n -: most frequent value becomes over-represented in the sample\n\n 'nearest neighbour':\n calculates normalised distance between items in source columns\n selects nearest neighbour and copies value as replacement.\n +: works for any datatype.\n -: computationally intensive (e.g. slow)\n\n sources (list of strings): NEAREST NEIGHBOUR ONLY\n column names to be used during imputation.\n if None or empty, all columns will be used.\n\n Returns:\n table: table with replaced values.\n \"\"\"\n sub_cls_check(T, BaseTable)\n\n if isinstance(targets, str) and targets not in T.columns:\n targets = [targets]\n if isinstance(targets, list):\n for name in targets:\n if not isinstance(name, str):\n raise TypeError(f\"expected str, not {type(name)}\")\n if name not in T.columns:\n raise ValueError(f\"target item {name} not a column name in T.columns:\\n{T.columns}\")\n else:\n raise TypeError(\"Expected source as list of column names\")\n\n if missing is None:\n missing = {None}\n else:\n missing = set(missing)\n\n if method == \"nearest neighbour\":\n if sources in (None, []):\n sources = list(T.columns)\n if isinstance(sources, str):\n sources = [sources]\n if isinstance(sources, list):\n for name in sources:\n if not isinstance(name, str):\n raise TypeError(f\"expected str, not {type(name)}\")\n if name not in T.columns:\n raise ValueError(f\"source item {name} not a column name in T.columns:\\n{T.columns}\")\n else:\n raise TypeError(\"Expected source as list of column names\")\n\n methods = [\"nearest neighbour\", \"mean\", \"mode\", \"carry forward\"]\n\n if method == \"carry forward\":\n return carry_forward(T, targets, missing, tqdm=tqdm, pbar=pbar)\n elif method in {\"mean\", \"mode\"}:\n return stats_method(T, targets, missing, method, tqdm=tqdm, pbar=pbar)\n elif method == \"nearest neighbour\":\n return nearest_neighbour(T, sources, missing, targets, tqdm=tqdm)\n else:\n raise ValueError(f\"method {method} not recognised amonst known methods: {list(methods)})\")\n "},{"location":"reference/imputation/#tablite.imputation.carry_forward","title":"tablite.imputation.carry_forward(T, targets, missing, tqdm=_tqdm, pbar=None) ","text":"Source code in tablite/imputation.py def carry_forward(T, targets, missing, tqdm=_tqdm, pbar=None):\n assert isinstance(missing, set)\n\n if pbar is None:\n total = len(targets) * len(T)\n pbar = tqdm(total=total, desc=\"imputation.carry_forward\", disable=Config.TQDM_DISABLE)\n\n new = T.copy()\n for name in T.columns:\n if name in targets:\n data = 
T[name][:] # create copy\n last_value = None\n for ix, v in enumerate(data):\n if v in missing: # perform replacement\n data[ix] = last_value\n else: # keep last value.\n last_value = v\n pbar.update(1)\n new[name] = data\n else:\n new[name] = T[name]\n\n return new\n "},{"location":"reference/imputation/#tablite.imputation.stats_method","title":"tablite.imputation.stats_method(T, targets, missing, method, tqdm=_tqdm, pbar=None) ","text":"Source code in tablite/imputation.py def stats_method(T, targets, missing, method, tqdm=_tqdm, pbar=None):\n assert isinstance(missing, set)\n\n if pbar is None:\n total = len(targets)\n pbar = tqdm(total=total, desc=f\"imputation.{method}\", disable=Config.TQDM_DISABLE)\n\n new = T.copy()\n for name in T.columns:\n if name in targets:\n col = T.columns[name]\n assert isinstance(col, Column)\n\n hist_values, hist_counts = col.histogram()\n\n for m in missing:\n try:\n idx = hist_values.index(m)\n hist_counts[idx] = 0\n except ValueError:\n pass\n\n stats = summary_statistics(hist_values, hist_counts)\n\n new_value = stats[method]\n col.replace(mapping={m: new_value for m in missing})\n new[name] = col\n pbar.update(1)\n else:\n new[name] = T[name] # no entropy, keep as is.\n\n return new\n "},{"location":"reference/imputation/#tablite.imputation-modules","title":"Modules","text":""},{"location":"reference/joins/","title":"Joins","text":""},{"location":"reference/joins/#tablite.joins","title":"tablite.joins ","text":""},{"location":"reference/joins/#tablite.joins-classes","title":"Classes","text":""},{"location":"reference/joins/#tablite.joins-functions","title":"Functions","text":""},{"location":"reference/joins/#tablite.joins.join","title":"tablite.joins.join(T: BaseTable, other: BaseTable, left_keys: List[str], right_keys: List[str], left_columns: Union[List[str], None], right_columns: Union[List[str], None], kind: str = 'inner', merge_keys: bool = False, tqdm=_tqdm, pbar=None) ","text":"short-cut for all join functions. PARAMETER DESCRIPTION T left table TYPE: Table other right table TYPE: Table left_keys list of keys for the join from left table. TYPE: list right_keys list of keys for the join from right table. TYPE: list left_columns list of columns names to retain from left table. If None, all are retained. TYPE: list right_columns list of columns names to retain from right table. If None, all are retained. TYPE: list kind 'inner', 'left', 'outer', 'cross'. Defaults to \"inner\". TYPE: str DEFAULT: 'inner' tqdm tqdm progress counter. Defaults to _tqdm. TYPE: tqdm DEFAULT: tqdm pbar tqdm.progressbar. Defaults to None. TYPE: pbar DEFAULT: None RAISES DESCRIPTION ValueError if join type is unknown. RETURNS DESCRIPTION Table joined table. 
Example: \"inner\" SQL: SELECT number, letter FROM numbers JOIN letters ON numbers.colour == letters.color\n Tablite: >>> inner_join = numbers.inner_join(\n letters, \n left_keys=['colour'], \n right_keys=['color'], \n left_columns=['number'], \n right_columns=['letter']\n)\n Example: \"left\" SQL: SELECT number, letter FROM numbers LEFT JOIN letters ON numbers.colour == letters.color\n Tablite: >>> left_join = numbers.left_join(\n letters, \n left_keys=['colour'], \n right_keys=['color'], \n left_columns=['number'], \n right_columns=['letter']\n)\n Example: \"outer\" SQL: SELECT number, letter FROM numbers OUTER JOIN letters ON numbers.colour == letters.color\n Tablite: >>> outer_join = numbers.outer_join(\n letters, \n left_keys=['colour'], \n right_keys=['color'], \n left_columns=['number'], \n right_columns=['letter']\n )\n Example: \"cross\" CROSS JOIN returns the Cartesian product of rows from tables in the join. In other words, it will produce rows which combine each row from the first table with each row from the second table Source code in tablite/joins.py def join(\n T: BaseTable,\n other: BaseTable,\n left_keys: List[str],\n right_keys: List[str],\n left_columns: Union[List[str], None],\n right_columns: Union[List[str], None],\n kind: str = \"inner\",\n merge_keys: bool = False,\n tqdm=_tqdm,\n pbar=None,\n):\n \"\"\"short-cut for all join functions.\n\n Args:\n T (Table): left table\n other (Table): right table\n left_keys (list): list of keys for the join from left table.\n right_keys (list): list of keys for the join from right table.\n left_columns (list): list of columns names to retain from left table.\n If None, all are retained.\n right_columns (list): list of columns names to retain from right table.\n If None, all are retained.\n kind (str, optional): 'inner', 'left', 'outer', 'cross'. Defaults to \"inner\".\n tqdm (tqdm, optional): tqdm progress counter. Defaults to _tqdm.\n pbar (tqdm.pbar, optional): tqdm.progressbar. 
Defaults to None.\n\n Raises:\n ValueError: if join type is unknown.\n\n Returns:\n Table: joined table.\n\n Example: \"inner\"\n ```\n SQL: SELECT number, letter FROM numbers JOIN letters ON numbers.colour == letters.color\n ```\n Tablite: \n ```\n >>> inner_join = numbers.inner_join(\n letters, \n left_keys=['colour'], \n right_keys=['color'], \n left_columns=['number'], \n right_columns=['letter']\n )\n ```\n\n Example: \"left\" \n ```\n SQL: SELECT number, letter FROM numbers LEFT JOIN letters ON numbers.colour == letters.color\n ```\n Tablite: \n ```\n >>> left_join = numbers.left_join(\n letters, \n left_keys=['colour'], \n right_keys=['color'], \n left_columns=['number'], \n right_columns=['letter']\n )\n ```\n\n Example: \"outer\"\n ```\n SQL: SELECT number, letter FROM numbers OUTER JOIN letters ON numbers.colour == letters.color\n ```\n\n Tablite: \n ```\n >>> outer_join = numbers.outer_join(\n letters, \n left_keys=['colour'], \n right_keys=['color'], \n left_columns=['number'], \n right_columns=['letter']\n )\n ```\n\n Example: \"cross\"\n\n CROSS JOIN returns the Cartesian product of rows from tables in the join.\n In other words, it will produce rows which combine each row from the first table\n with each row from the second table\n \"\"\"\n if left_columns is None:\n left_columns = list(T.columns)\n if right_columns is None:\n right_columns = list(other.columns)\n assert merge_keys in {True,False}\n\n _jointype_check(T, other, left_keys, right_keys, left_columns, right_columns)\n\n return _join(kind, T,other,left_keys, right_keys, left_columns, right_columns, merge_keys=merge_keys,\n tqdm=tqdm, pbar=pbar)\n "},{"location":"reference/joins/#tablite.joins.inner_join","title":"tablite.joins.inner_join(T: BaseTable, other: BaseTable, left_keys: List[str], right_keys: List[str], left_columns: Union[List[str], None], right_columns: Union[List[str], None], merge_keys: bool = False, tqdm=_tqdm, pbar=None) ","text":"Source code in tablite/joins.py def inner_join(T: BaseTable, other: BaseTable, left_keys: List[str], right_keys: List[str], \n left_columns: Union[List[str], None], right_columns: Union[List[str], None],\n merge_keys: bool = False, tqdm=_tqdm, pbar=None):\n return join(T, other, left_keys, right_keys, left_columns, right_columns, kind=\"inner\", merge_keys=merge_keys, tqdm=tqdm,pbar=pbar)\n "},{"location":"reference/joins/#tablite.joins.left_join","title":"tablite.joins.left_join(T: BaseTable, other: BaseTable, left_keys: List[str], right_keys: List[str], left_columns: Union[List[str], None], right_columns: Union[List[str], None], merge_keys: bool = False, tqdm=_tqdm, pbar=None) ","text":"Source code in tablite/joins.py def left_join(T: BaseTable, other: BaseTable, left_keys: List[str], right_keys: List[str], \n left_columns: Union[List[str], None], right_columns: Union[List[str], None],\n merge_keys: bool = False, tqdm=_tqdm, pbar=None):\n return join(T, other, left_keys, right_keys, left_columns, right_columns, kind=\"left\", merge_keys=merge_keys, tqdm=tqdm,pbar=pbar)\n "},{"location":"reference/joins/#tablite.joins.outer_join","title":"tablite.joins.outer_join(T: BaseTable, other: BaseTable, left_keys: List[str], right_keys: List[str], left_columns: Union[List[str], None], right_columns: Union[List[str], None], merge_keys: bool = False, tqdm=_tqdm, pbar=None) ","text":"Source code in tablite/joins.py def outer_join(T: BaseTable, other: BaseTable, left_keys: List[str], right_keys: List[str], \n left_columns: Union[List[str], None], right_columns: Union[List[str], 
None],\n merge_keys: bool = False, tqdm=_tqdm, pbar=None):\n return join(T, other, left_keys, right_keys, left_columns, right_columns, kind=\"outer\", merge_keys=merge_keys, tqdm=tqdm,pbar=pbar)\n "},{"location":"reference/joins/#tablite.joins.cross_join","title":"tablite.joins.cross_join(T: BaseTable, other: BaseTable, left_keys: List[str], right_keys: List[str], left_columns: Union[List[str], None], right_columns: Union[List[str], None], merge_keys: bool = False, tqdm=_tqdm, pbar=None) ","text":"Source code in tablite/joins.py def cross_join(T: BaseTable, other: BaseTable, left_keys: List[str], right_keys: List[str], \n left_columns: Union[List[str], None], right_columns: Union[List[str], None],\n merge_keys: bool = False, tqdm=_tqdm, pbar=None):\n return join(T, other, left_keys, right_keys, left_columns, right_columns, kind=\"cross\", merge_keys=merge_keys, tqdm=tqdm,pbar=pbar)\n "},{"location":"reference/lookup/","title":"Lookup","text":""},{"location":"reference/lookup/#tablite.lookup","title":"tablite.lookup ","text":""},{"location":"reference/lookup/#tablite.lookup-attributes","title":"Attributes","text":""},{"location":"reference/lookup/#tablite.lookup-classes","title":"Classes","text":""},{"location":"reference/lookup/#tablite.lookup-functions","title":"Functions","text":""},{"location":"reference/lookup/#tablite.lookup.lookup","title":"tablite.lookup.lookup(T, other, *criteria, all=True, tqdm=_tqdm) ","text":"function for looking up values in other according to criteria in ascending order. :param: T: Table :param: other: Table sorted in ascending search order. :param: criteria: Each criteria must be a tuple with value comparisons in the form: (LEFT, OPERATOR, RIGHT) :param: all: boolean: True=ALL, False=ANY OPERATOR must be a callable that returns a boolean LEFT must be a value that the OPERATOR can compare. RIGHT must be a value that the OPERATOR can compare. Examples: comparison of two columns: ('column A', \"==\", 'column B')\n compare value from column 'Date' with date 24/12. 
('Date', \"<\", DataTypes.date(24,12) )\n uses custom function to compare value from column 'text 1' with value from column 'text 2' f = lambda L,R: all( ord(L) < ord(R) )\n('text 1', f, 'text 2')\n Source code in tablite/lookup.py def lookup(T, other, *criteria, all=True, tqdm=_tqdm):\n \"\"\"function for looking up values in `other` according to criteria in ascending order.\n :param: T: Table \n :param: other: Table sorted in ascending search order.\n :param: criteria: Each criteria must be a tuple with value comparisons in the form:\n (LEFT, OPERATOR, RIGHT)\n :param: all: boolean: True=ALL, False=ANY\n\n OPERATOR must be a callable that returns a boolean\n LEFT must be a value that the OPERATOR can compare.\n RIGHT must be a value that the OPERATOR can compare.\n\n Examples:\n comparison of two columns:\n\n ('column A', \"==\", 'column B')\n\n compare value from column 'Date' with date 24/12.\n\n ('Date', \"<\", DataTypes.date(24,12) )\n\n uses custom function to compare value from column\n 'text 1' with value from column 'text 2'\n\n f = lambda L,R: all( ord(L) < ord(R) )\n ('text 1', f, 'text 2')\n\n \"\"\"\n sub_cls_check(T, BaseTable)\n sub_cls_check(other, BaseTable)\n\n all = all\n any = not all\n\n ops = lookup_ops\n\n functions, left_criteria, right_criteria = [], set(), set()\n\n for left, op, right in criteria:\n left_criteria.add(left)\n right_criteria.add(right)\n if callable(op):\n pass # it's a custom function.\n else:\n op = ops.get(op, None)\n if not callable(op):\n raise ValueError(f\"{op} not a recognised operator for comparison.\")\n\n functions.append((op, left, right))\n left_columns = [n for n in left_criteria if n in T.columns]\n right_columns = [n for n in right_criteria if n in other.columns]\n\n result_index = np.empty(shape=(len(T)), dtype=np.int64)\n cache = {}\n left = T[left_columns]\n Constr = type(T)\n if isinstance(left, Column):\n tmp, left = left, Constr()\n left[left_columns[0]] = tmp\n right = other[right_columns]\n if isinstance(right, Column):\n tmp, right = right, Constr()\n right[right_columns[0]] = tmp\n assert isinstance(left, BaseTable)\n assert isinstance(right, BaseTable)\n\n for ix, row1 in tqdm(enumerate(left.rows), total=len(T), disable=Config.TQDM_DISABLE):\n row1_tup = tuple(row1)\n row1d = {name: value for name, value in zip(left_columns, row1)}\n row1_hash = hash(row1_tup)\n\n match_found = True if row1_hash in cache else False\n\n if not match_found: # search.\n for row2ix, row2 in enumerate(right.rows):\n row2d = {name: value for name, value in zip(right_columns, row2)}\n\n evaluations = {op(row1d.get(left, left), row2d.get(right, right)) for op, left, right in functions}\n # The evaluations above does a neat trick:\n # as L is a dict, L.get(left, L) will return a value\n # from the columns IF left is a column name. 
If it isn't\n # the function will treat left as a value.\n # The same applies to right.\n all_ = all and (False not in evaluations)\n any_ = any and True in evaluations\n if all_ or any_:\n match_found = True\n cache[row1_hash] = row2ix\n break\n\n if not match_found: # no match found.\n cache[row1_hash] = -1 # -1 is replacement for None in the index as numpy can't handle Nones.\n\n result_index[ix] = cache[row1_hash]\n\n f = select_processing_method(2 * max(len(T), len(other)), _sp_lookup, _mp_lookup)\n return f(T, other, result_index)\n "},{"location":"reference/match/","title":"Match","text":""},{"location":"reference/match/#tablite.match","title":"tablite.match ","text":""},{"location":"reference/match/#tablite.match-classes","title":"Classes","text":""},{"location":"reference/match/#tablite.match-functions","title":"Functions","text":""},{"location":"reference/match/#tablite.match.match","title":"tablite.match.match(T, other, *criteria, keep_left=None, keep_right=None) ","text":"performs inner join where T matches other and removes rows that do not match. :param: T: Table :param: other: Table :param: criteria: Each criteria must be a tuple with value comparisons in the form: (LEFT, OPERATOR, RIGHT), where operator must be \"==\"\n\nExample:\n ('column A', \"==\", 'column B')\n\nThis syntax follows the lookup syntax. See Lookup for details.\n :param: keep_left: list of columns to keep. :param: keep_right: list of right columns to keep. Source code in tablite/match.py def match(T, other, *criteria, keep_left=None, keep_right=None): # lookup and filter combined - drops unmatched rows.\n \"\"\"\n performs inner join where `T` matches `other` and removes rows that do not match.\n\n :param: T: Table\n :param: other: Table\n :param: criteria: Each criteria must be a tuple with value comparisons in the form:\n\n (LEFT, OPERATOR, RIGHT), where operator must be \"==\"\n\n Example:\n ('column A', \"==\", 'column B')\n\n This syntax follows the lookup syntax. See Lookup for details.\n\n :param: keep_left: list of columns to keep.\n :param: keep_right: list of right columns to keep.\n \"\"\"\n assert isinstance(T, BaseTable)\n assert isinstance(other, BaseTable)\n if keep_left is None:\n keep_left = [n for n in T.columns]\n else:\n type_check(keep_left, list)\n name_check(T.columns, *keep_left)\n\n if keep_right is None:\n keep_right = [n for n in other.columns]\n else:\n type_check(keep_right, list)\n name_check(other.columns, *keep_right)\n\n indices = np.full(shape=(len(T),), fill_value=-1, dtype=np.int64)\n for arg in criteria:\n b,_,a = arg\n if _ != \"==\":\n raise ValueError(\"match requires A == B. 
For other logic visit `lookup`\")\n if b not in T.columns:\n raise ValueError(f\"Column {b} not found in T for criteria: {arg}\")\n if a not in other.columns:\n raise ValueError(f\"Column {a} not found in T for criteria: {arg}\")\n\n index_update = find_indices(other[a][:], T[b][:], fill_value=-1)\n indices = merge_indices(indices, index_update)\n\n cls = type(T)\n new = cls()\n for name in T.columns:\n if name in keep_left:\n new[name] = np.compress(indices != -1, T[name][:])\n\n for name in other.columns:\n if name in keep_right:\n new_name = unique_name(name, new.columns)\n primary = np.compress(indices != -1, indices)\n new[new_name] = np.take(other[name][:], primary)\n\n return new\n "},{"location":"reference/match/#tablite.match.find_indices","title":"tablite.match.find_indices(x, y, fill_value=-1) ","text":"finds index of y in x Source code in tablite/match.py def find_indices(x,y, fill_value=-1): # fast.\n \"\"\"\n finds index of y in x\n \"\"\"\n # disassembly of numpy:\n # import numpy as np\n # x = np.array([3, 5, 7, 1, 9, 8, 6, 6])\n # y = np.array([2, 1, 5, 10, 100, 6])\n index = np.argsort(x) # array([3, 0, 1, 6, 7, 2, 5, 4])\n sorted_x = x[index] # array([1, 3, 5, 6, 6, 7, 8, 9])\n sorted_index = np.searchsorted(sorted_x, y) # array([1, 0, 2, 8, 8, 3])\n yindex = np.take(index, sorted_index, mode=\"clip\") # array([0, 3, 1, 4, 4, 6])\n mask = x[yindex] != y # array([ True, False, False, True, True, False])\n indices = np.ma.array(yindex, mask=mask, fill_value=fill_value) \n # masked_array(data=[--, 3, 1, --, --, 6], mask=[ True, False, False, True, True, False], fill_value=999999)\n # --: y[0] not in x\n # 3 : y[1] == x[3]\n # 1 : y[2] == x[1]\n # --: y[3] not in x\n # --: y[4] not in x\n # --: y[5] == x[6]\n result = np.where(~indices.mask, indices.data, -1) \n return result # array([-1, 3, 1, -1, -1, 6])\n "},{"location":"reference/match/#tablite.match.merge_indices","title":"tablite.match.merge_indices(x1, *args, fill_value=-1) ","text":"merges x1 and x2 where Source code in tablite/match.py def merge_indices(x1, *args, fill_value=-1):\n \"\"\"\n merges x1 and x2 where \n \"\"\"\n # dis:\n # >>> AA = array([-1, 3, -1, 5])\n # >>> BB = array([-1, -1, 4, 5])\n new = x1[:] # = AA\n for arg in args:\n mask = (new == fill_value) # array([True, False, True, False])\n new = np.where(mask, arg, new) # array([-1, 3, 4, 5])\n return new # array([-1, 3, 4, 5])\n "},{"location":"reference/merge/","title":"Merge","text":""},{"location":"reference/merge/#tablite.merge","title":"tablite.merge ","text":""},{"location":"reference/merge/#tablite.merge-classes","title":"Classes","text":""},{"location":"reference/merge/#tablite.merge-functions","title":"Functions","text":""},{"location":"reference/merge/#tablite.merge.where","title":"tablite.merge.where(T, criteria, left, right, new) ","text":"takes from LEFT where criteria is True else RIGHT and creates a single new column. 
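A self-contained sketch of `where` as summarised above (its parameters are detailed just below); the table contents are invented:

```python
# Hypothetical sketch of tablite.merge.where, per the summary above:
# take from `left` where criteria is True, else from `right`, into `new`.
import numpy as np

from tablite import Table  # assumed package-root import
from tablite.merge import where

t = Table(columns={"a": [1, 2, 3], "b": [10, 20, 30]})
mask = np.array([True, False, True])  # one bool per row

# Consumes columns 'a' and 'b' and adds 'c' == [1, 20, 3].
t = where(t, criteria=mask, left="a", right="b", new="c")
t.show()
```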
:param: T: Table :param: criteria: np.array(bool): if True take left column else take right column :param left: (str) column name :param right: (str) column name :param new: (str) new name :returns: T Source code in tablite/merge.py def where(T, criteria, left, right, new):\n \"\"\" takes from LEFT where criteria is True else RIGHT \n and creates a single new column.\n\n :param: T: Table\n :param: criteria: np.array(bool): \n if True take left column\n else take right column\n :param left: (str) column name\n :param right: (str) column name\n :param new: (str) new name\n\n :returns: T\n \"\"\"\n type_check(T, BaseTable)\n if isinstance(criteria, np.ndarray):\n if not criteria.dtype == \"bool\":\n raise TypeError\n else:\n criteria = np.array(criteria, dtype='bool')\n\n new_uq = unique_name(new, list(T.columns))\n T.add_column(new_uq)\n col = T[new_uq]\n\n for start,end in Config.page_steps(len(criteria)):\n left_values = T[left][start:end]\n right_values = T[right][start:end]\n new_values = np.where(criteria, left_values, right_values)\n col.extend(new_values)\n\n if new == right:\n T[right] = T[new_uq] # keep column order\n del T[new_uq]\n del T[left]\n elif new == left:\n T[left] = T[new_uq] # keep column order\n del T[new_uq]\n del T[right]\n else:\n T[new] = T[new_uq]\n del T[left]\n del T[right]\n return T\n "},{"location":"reference/mp_utils/","title":"Mp utils","text":""},{"location":"reference/mp_utils/#tablite.mp_utils","title":"tablite.mp_utils ","text":""},{"location":"reference/mp_utils/#tablite.mp_utils-attributes","title":"Attributes","text":""},{"location":"reference/mp_utils/#tablite.mp_utils.lookup_ops","title":"tablite.mp_utils.lookup_ops = {'in': _in, 'not in': not_in, '<': operator.lt, '<=': operator.le, '>': operator.gt, '>=': operator.ge, '!=': operator.ne, '==': operator.eq} module-attribute ","text":""},{"location":"reference/mp_utils/#tablite.mp_utils.filter_ops","title":"tablite.mp_utils.filter_ops = {'>': operator.gt, '>=': operator.ge, '==': operator.eq, '<': operator.lt, '<=': operator.le, '!=': operator.ne, 'in': _in} module-attribute ","text":""},{"location":"reference/mp_utils/#tablite.mp_utils.filter_ops_from_text","title":"tablite.mp_utils.filter_ops_from_text = {'gt': '>', 'gteq': '>=', 'eq': '==', 'lt': '<', 'lteq': '<=', 'neq': '!=', 'in': _in} module-attribute ","text":""},{"location":"reference/mp_utils/#tablite.mp_utils-classes","title":"Classes","text":""},{"location":"reference/mp_utils/#tablite.mp_utils-functions","title":"Functions","text":""},{"location":"reference/mp_utils/#tablite.mp_utils.not_in","title":"tablite.mp_utils.not_in(a, b) ","text":"Source code in tablite/mp_utils.py def not_in(a, b):\n return not operator.contains(str(a), str(b))\n "},{"location":"reference/mp_utils/#tablite.mp_utils.is_mp","title":"tablite.mp_utils.is_mp(fields: int) -> bool ","text":"PARAMETER DESCRIPTION fields number of fields TYPE: int RETURNS DESCRIPTION bool bool Source code in tablite/mp_utils.py def is_mp(fields: int) -> bool:\n \"\"\"\n\n Args:\n fields (int): number of fields\n\n Returns:\n bool\n \"\"\"\n if Config.MULTIPROCESSING_MODE == Config.FORCE:\n return True\n\n if Config.MULTIPROCESSING_MODE == Config.FALSE:\n return False\n\n if fields < Config.SINGLE_PROCESSING_LIMIT:\n return False\n\n if max(psutil.cpu_count(logical=False), 1) < 2:\n return False\n\n return True\n "},{"location":"reference/mp_utils/#tablite.mp_utils.select_processing_method","title":"tablite.mp_utils.select_processing_method(fields, sp, mp) ","text":"PARAMETER DESCRIPTION 
fields number of fields TYPE: int sp method for single processing TYPE: callable mp method for multiprocessing TYPE: callable RETURNS DESCRIPTION _type_ description Source code in tablite/mp_utils.py def select_processing_method(fields, sp, mp):\n \"\"\"\n\n Args:\n fields (int): number of fields\n sp (callable): method for single processing\n mp (callable): method for multiprocessing\n\n Returns:\n _type_: _description_\n \"\"\"\n return mp if is_mp(fields) else sp\n "},{"location":"reference/mp_utils/#tablite.mp_utils.maskify","title":"tablite.mp_utils.maskify(arr) ","text":"Source code in tablite/mp_utils.py def maskify(arr):\n none_mask = [False] * len(arr) # Setting the default\n\n for i in range(len(arr)):\n if arr[i] is None: # Check if our value is None\n none_mask[i] = True\n arr[i] = 0 # Remove None from the original array\n\n return none_mask\n "},{"location":"reference/mp_utils/#tablite.mp_utils.share_mem","title":"tablite.mp_utils.share_mem(inp_arr, dtype) ","text":"Source code in tablite/mp_utils.py def share_mem(inp_arr, dtype):\n len_ = len(inp_arr)\n size = np.dtype(dtype).itemsize * len_\n shape = (len_,)\n\n out_shm = shared_memory.SharedMemory(create=True, size=size) # the co_processors will read this.\n out_arr_index = np.ndarray(shape, dtype=dtype, buffer=out_shm.buf)\n out_arr_index[:] = inp_arr\n\n return out_arr_index, out_shm\n "},{"location":"reference/mp_utils/#tablite.mp_utils.map_task","title":"tablite.mp_utils.map_task(data_shm_name, index_shm_name, destination_shm_name, shape, dtype, start, end) ","text":"Source code in tablite/mp_utils.py def map_task(data_shm_name, index_shm_name, destination_shm_name, shape, dtype, start, end):\n # connect\n shared_data = shared_memory.SharedMemory(name=data_shm_name)\n data = np.ndarray(shape, dtype=dtype, buffer=shared_data.buf)\n\n shared_index = shared_memory.SharedMemory(name=index_shm_name)\n index = np.ndarray(shape, dtype=np.int64, buffer=shared_index.buf)\n\n shared_target = shared_memory.SharedMemory(name=destination_shm_name)\n target = np.ndarray(shape, dtype=dtype, buffer=shared_target.buf)\n # work\n target[start:end] = np.take(data[start:end], index[start:end])\n # disconnect\n shared_data.close()\n shared_index.close()\n shared_target.close()\n "},{"location":"reference/mp_utils/#tablite.mp_utils.reindex_task","title":"tablite.mp_utils.reindex_task(src, dst, index_shm, shm_shape, start, end) ","text":"Source code in tablite/mp_utils.py def reindex_task(src, dst, index_shm, shm_shape, start, end):\n # connect\n existing_shm = shared_memory.SharedMemory(name=index_shm)\n shared_index = np.ndarray(shm_shape, dtype=np.int64, buffer=existing_shm.buf)\n # work\n array = load_numpy(src)\n new = np.take(array, shared_index[start:end])\n np.save(dst, new, allow_pickle=True, fix_imports=False)\n # disconnect\n existing_shm.close()\n "},{"location":"reference/nimlite/","title":"Nimlite","text":""},{"location":"reference/nimlite/#tablite.nimlite","title":"tablite.nimlite ","text":""},{"location":"reference/nimlite/#tablite.nimlite-attributes","title":"Attributes","text":""},{"location":"reference/nimlite/#tablite.nimlite.paths","title":"tablite.nimlite.paths = sys.argv[:] module-attribute ","text":""},{"location":"reference/nimlite/#tablite.nimlite.K","title":"tablite.nimlite.K = TypeVar('K', bound=BaseTable) module-attribute ","text":""},{"location":"reference/nimlite/#tablite.nimlite.ValidEncoders","title":"tablite.nimlite.ValidEncoders = Literal['ENC_UTF8', 'ENC_UTF16', 'ENC_WIN1250'] module-attribute 
","text":""},{"location":"reference/nimlite/#tablite.nimlite.ValidQuoting","title":"tablite.nimlite.ValidQuoting = Literal['QUOTE_MINIMAL', 'QUOTE_ALL', 'QUOTE_NONNUMERIC', 'QUOTE_NONE', 'QUOTE_STRINGS', 'QUOTE_NOTNULL'] module-attribute ","text":""},{"location":"reference/nimlite/#tablite.nimlite.ValidSkipEmpty","title":"tablite.nimlite.ValidSkipEmpty = Literal['NONE', 'ANY', 'ALL'] module-attribute ","text":""},{"location":"reference/nimlite/#tablite.nimlite.ColumnSelectorDict","title":"tablite.nimlite.ColumnSelectorDict = TypedDict('ColumnSelectorDict', {'column': str, 'type': Literal['int', 'float', 'bool', 'str', 'date', 'time', 'datetime'], 'allow_empty': Union[bool, None], 'rename': Union[str, None]}) module-attribute ","text":""},{"location":"reference/nimlite/#tablite.nimlite.FilterCriteria","title":"tablite.nimlite.FilterCriteria = Literal['>', '>=', '==', '<', '<=', '!=', 'in'] module-attribute ","text":""},{"location":"reference/nimlite/#tablite.nimlite.FilterType","title":"tablite.nimlite.FilterType = Literal['all', 'any'] module-attribute ","text":""},{"location":"reference/nimlite/#tablite.nimlite.FilterDict","title":"tablite.nimlite.FilterDict = TypedDict('FilterDict', {'column1': str, 'value1': Union[str, None], 'criteria': FilterCriteria, 'column2': str, 'value2': Union[str, None]}) module-attribute ","text":""},{"location":"reference/nimlite/#tablite.nimlite-classes","title":"Classes","text":""},{"location":"reference/nimlite/#tablite.nimlite-functions","title":"Functions","text":""},{"location":"reference/nimlite/#tablite.nimlite.get_headers","title":"tablite.nimlite.get_headers(path: Union[str, Path], encoding: ValidEncoders = 'ENC_UTF8', *, header_row_index: int = 0, newline: str = '\\n', delimiter: str = ',', text_qualifier: str = '\"', quoting: ValidQuoting, strip_leading_and_tailing_whitespace: bool = True, linecount: int = 10) -> list[list[str]] ","text":"Source code in tablite/nimlite.py def get_headers(\n path: Union[str, Path],\n encoding: ValidEncoders =\"ENC_UTF8\",\n *,\n header_row_index: int=0,\n newline: str='\\n', delimiter: str=',', text_qualifier: str='\"',\n quoting: ValidQuoting, strip_leading_and_tailing_whitespace: bool=True,\n linecount: int = 10\n) -> list[list[str]]:\n return nl.get_headers(\n path=str(path),\n encoding=encoding,\n newline=newline, delimiter=delimiter, text_qualifier=text_qualifier,\n strip_leading_and_tailing_whitespace=strip_leading_and_tailing_whitespace,\n header_row_index=header_row_index,\n quoting=quoting,\n linecount=linecount\n )\n "},{"location":"reference/nimlite/#tablite.nimlite.text_reader","title":"tablite.nimlite.text_reader(T: Type[K], pid: str, path: Union[str, Path], encoding: ValidEncoders = 'ENC_UTF8', *, first_row_has_headers: bool = True, header_row_index: int = 0, columns: List[Union[str, None]] = None, start: Union[str, None] = None, limit: Union[str, None] = None, guess_datatypes: bool = False, newline: str = '\\n', delimiter: str = ',', text_qualifier: str = '\"', quoting: ValidQuoting, strip_leading_and_tailing_whitespace: bool = True, skip_empty: ValidSkipEmpty = 'NONE', tqdm=_tqdm) -> K ","text":"Source code in tablite/nimlite.py def text_reader(\n T: Type[K],\n pid: str, path: Union[str, Path],\n encoding: ValidEncoders =\"ENC_UTF8\",\n *,\n first_row_has_headers: bool=True, header_row_index: int=0,\n columns: List[Union[str, None]]=None,\n start: Union[str, None] = None, limit: Union[str, None]=None,\n guess_datatypes: bool =False,\n newline: str='\\n', delimiter: str=',', text_qualifier: 
str='\"',\n quoting: ValidQuoting, strip_leading_and_tailing_whitespace: bool=True, skip_empty: ValidSkipEmpty = \"NONE\",\n tqdm=_tqdm\n) -> K:\n assert isinstance(path, Path)\n assert isinstance(pid, Path)\n with tqdm(total=10, desc=f\"importing file\") as pbar:\n table = nl.text_reader(\n pid=str(pid),\n path=str(path),\n encoding=encoding,\n first_row_has_headers=first_row_has_headers, header_row_index=header_row_index,\n columns=columns,\n start=start, limit=limit,\n guess_datatypes=guess_datatypes,\n newline=newline, delimiter=delimiter, text_qualifier=text_qualifier,\n quoting=quoting,\n strip_leading_and_tailing_whitespace=strip_leading_and_tailing_whitespace,\n skip_empty=skip_empty,\n page_size=Config.PAGE_SIZE\n )\n\n pbar.update(1)\n\n task_info = table[\"task\"]\n task_columns = table[\"columns\"]\n\n ti_tasks = task_info[\"tasks\"]\n ti_import_field_names = task_info[\"import_field_names\"]\n\n is_windows = platform.system() == \"Windows\"\n use_logical = False if is_windows else True\n\n cpus = max(psutil.cpu_count(logical=use_logical), 1)\n\n pbar_step = 4 / max(len(ti_tasks), 1)\n\n class WrapUpdate:\n def update(self, n):\n pbar.update(n * pbar_step)\n\n wrapped_pbar = WrapUpdate()\n\n def next_task(task: Task, page_info):\n wrapped_pbar.update(1)\n return Task(\n nl.text_reader_task,\n *task.args, **task.kwargs, page_info=page_info\n )\n\n tasks = [\n TaskChain(\n Task(\n nl.collect_text_reader_page_info_task,\n task=t,\n task_info=task_info\n ), next_task=next_task\n ) for t in ti_tasks\n ]\n\n is_sp = False\n\n if Config.MULTIPROCESSING_MODE == Config.FALSE:\n is_sp = True\n elif Config.MULTIPROCESSING_MODE == Config.FORCE:\n is_sp = False\n elif Config.MULTIPROCESSING_MODE == Config.AUTO and cpus <= 1 or len(tasks) <= 1:\n is_sp = True\n\n if is_sp:\n res = []\n\n for task in tasks:\n page = task.execute()\n\n res.append(page)\n else:\n with TaskManager(cpus, error_mode=\"exception\") as tm:\n res = tm.execute(tasks, pbar=wrapped_pbar)\n\n col_path = pid\n column_dict = {\n cols: Column(col_path)\n for cols in ti_import_field_names\n }\n\n for res_pages in res:\n col_map = {\n n: res_pages[i]\n for i, n in enumerate(ti_import_field_names)\n }\n\n for k, c in column_dict.items():\n c.pages.append(col_map[k])\n\n if columns is None:\n columns = [c[\"name\"] for c in task_columns]\n\n table_dict = {\n a[\"name\"]: column_dict[b]\n for a, b in zip(task_columns, columns)\n }\n\n pbar.update(pbar.total - pbar.n)\n\n table = T(columns=table_dict)\n\n return table\n "},{"location":"reference/nimlite/#tablite.nimlite.wrap","title":"tablite.nimlite.wrap(str_: str) -> str ","text":"Source code in tablite/nimlite.py def wrap(str_: str) -> str:\n return '\"' + str_.replace('\"', '\\\\\"').replace(\"'\", \"\\\\'\").replace(\"\\n\", \"\\\\n\").replace(\"\\t\", \"\\\\t\") + '\"'\n "},{"location":"reference/nimlite/#tablite.nimlite.column_select","title":"tablite.nimlite.column_select(table: K, cols: list[ColumnSelectorDict], tqdm=_tqdm, TaskManager=TaskManager) -> Tuple[K, K] ","text":"Source code in tablite/nimlite.py def column_select(table: K, cols: list[ColumnSelectorDict], tqdm=_tqdm, TaskManager=TaskManager) -> Tuple[K, K]:\n with tqdm(total=100, desc=\"column select\", bar_format='{desc}: {percentage:.1f}%|{bar}{r_bar}') as pbar:\n T = type(table)\n dir_pid = Config.workdir / Config.pid\n\n col_infos = nl.collect_column_select_info(table, cols, str(dir_pid), pbar)\n\n columns = col_infos[\"columns\"]\n page_count = col_infos[\"page_count\"]\n is_correct_type = 
col_infos[\"is_correct_type\"]\n desired_column_map = col_infos[\"desired_column_map\"]\n original_pages_map = col_infos[\"original_pages_map\"]\n passed_column_data = col_infos[\"passed_column_data\"]\n failed_column_data = col_infos[\"failed_column_data\"]\n res_cols_pass = col_infos[\"res_cols_pass\"]\n res_cols_fail = col_infos[\"res_cols_fail\"]\n column_names = col_infos[\"column_names\"]\n reject_reason_name = col_infos[\"reject_reason_name\"]\n\n if all(is_correct_type.values()):\n tbl_pass_columns = {\n desired_name: table[desired_info[0]]\n for desired_name, desired_info in desired_column_map.items()\n }\n\n tbl_fail_columns = {\n desired_name: []\n for desired_name in failed_column_data\n }\n\n tbl_pass = T(columns=tbl_pass_columns)\n tbl_fail = T(columns=tbl_fail_columns)\n\n return (tbl_pass, tbl_fail)\n\n task_list_inp = (\n _collect_cs_info(i, columns, res_cols_pass, res_cols_fail, original_pages_map)\n for i in range(page_count)\n )\n\n page_size = Config.PAGE_SIZE\n\n tasks = (\n Task(\n nl.do_slice_convert, str(dir_pid), page_size, columns, reject_reason_name, res_pass, res_fail, desired_column_map, column_names, is_correct_type\n )\n for columns, res_pass, res_fail in task_list_inp\n )\n\n cpu_count = max(psutil.cpu_count(), 1)\n\n if Config.MULTIPROCESSING_MODE == Config.FORCE:\n is_mp = True\n elif Config.MULTIPROCESSING_MODE == Config.FALSE:\n is_mp = False\n elif Config.MULTIPROCESSING_MODE == Config.AUTO:\n is_multithreaded = cpu_count > 1\n is_multipage = page_count > 1\n\n is_mp = is_multithreaded and is_multipage\n\n tbl_pass = T({k: [] for k in passed_column_data})\n tbl_fail = T({k: [] for k in failed_column_data})\n\n converted = []\n step_size = 45 / max(page_count, 1)\n\n if is_mp:\n class WrapUpdate:\n def update(self, n):\n pbar.update(n * step_size)\n\n with TaskManager(min(cpu_count, page_count), error_mode=\"exception\") as tm:\n res = tm.execute(list(tasks), pbar=WrapUpdate())\n\n converted.extend(res)\n else:\n for task in tasks:\n res = task.f(*task.args, **task.kwargs)\n\n converted.append(res)\n pbar.update(step_size)\n\n def extend_table(table, columns):\n for (col_name, pg) in columns:\n table[col_name].pages.append(pg)\n\n for pg_pass, pg_fail in converted:\n extend_table(tbl_pass, pg_pass)\n extend_table(tbl_fail, pg_fail)\n\n pbar.update(pbar.total - pbar.n)\n\n return tbl_pass, tbl_fail\n "},{"location":"reference/nimlite/#tablite.nimlite.read_page","title":"tablite.nimlite.read_page(path: Union[str, Path]) -> np.ndarray ","text":"Source code in tablite/nimlite.py def read_page(path: Union[str, Path]) -> np.ndarray:\n return nl.read_page(str(path))\n "},{"location":"reference/nimlite/#tablite.nimlite.repaginate","title":"tablite.nimlite.repaginate(column: Column) ","text":"Source code in tablite/nimlite.py def repaginate(column: Column):\n nl.repaginate(column)\n "},{"location":"reference/nimlite/#tablite.nimlite.nearest_neighbour","title":"tablite.nimlite.nearest_neighbour(T: BaseTable, sources: Union[list[str], None], missing: Union[list, None], targets: Union[list[str], None], tqdm=_tqdm) ","text":"Source code in tablite/nimlite.py def nearest_neighbour(T: BaseTable, sources: Union[list[str], None], missing: Union[list, None], targets: Union[list[str], None], tqdm=_tqdm):\n return nl.nearest_neighbour(T, sources, list(missing), targets, tqdm)\n "},{"location":"reference/nimlite/#tablite.nimlite.groupby","title":"tablite.nimlite.groupby(T, keys, functions, tqdm=_tqdm) ","text":"Source code in tablite/nimlite.py def groupby(T, keys, 
functions, tqdm=_tqdm):\n return nl.groupby(T, keys, functions, tqdm)\n "},{"location":"reference/nimlite/#tablite.nimlite.filter","title":"tablite.nimlite.filter(table: BaseTable, expressions: list[FilterDict], type: FilterType, tqdm=_tqdm) ","text":"Source code in tablite/nimlite.py def filter(table: BaseTable, expressions: list[FilterDict], type: FilterType, tqdm = _tqdm):\n return nl.filter(table, expressions, type, tqdm)\n "},{"location":"reference/pivots/","title":"Pivots","text":""},{"location":"reference/pivots/#tablite.pivots","title":"tablite.pivots ","text":""},{"location":"reference/pivots/#tablite.pivots-classes","title":"Classes","text":""},{"location":"reference/pivots/#tablite.pivots-functions","title":"Functions","text":""},{"location":"reference/pivots/#tablite.pivots.pivot","title":"tablite.pivots.pivot(T, rows, columns, functions, values_as_rows=True, tqdm=_tqdm, pbar=None) ","text":"param: rows: column names to keep as rows param: columns: column names to keep as columns param: functions: aggregation functions from the Groupby class as example: >>> t.show()\n+=====+=====+=====+\n| A | B | C |\n| int | int | int |\n+-----+-----+-----+\n| 1| 1| 6|\n| 1| 2| 5|\n| 2| 3| 4|\n| 2| 4| 3|\n| 3| 5| 2|\n| 3| 6| 1|\n| 1| 1| 6|\n| 1| 2| 5|\n| 2| 3| 4|\n| 2| 4| 3|\n| 3| 5| 2|\n| 3| 6| 1|\n+=====+=====+=====+\n\n>>> t2 = t.pivot(rows=['C'], columns=['A'], functions=[('B', gb.sum)])\n>>> t2.show()\n+===+===+========+=====+=====+=====+\n| # | C |function|(A=1)|(A=2)|(A=3)|\n|row|int| str |mixed|mixed|mixed|\n+---+---+--------+-----+-----+-----+\n|0 | 6|Sum(B) | 2|None |None |\n|1 | 5|Sum(B) | 4|None |None |\n|2 | 4|Sum(B) |None | 6|None |\n|3 | 3|Sum(B) |None | 8|None |\n|4 | 2|Sum(B) |None |None | 10|\n|5 | 1|Sum(B) |None |None | 12|\n+===+===+========+=====+=====+=====+\n Source code in tablite/pivots.py def pivot(T, rows, columns, functions, values_as_rows=True, tqdm=_tqdm, pbar=None):\n \"\"\"\n param: rows: column names to keep as rows\n param: columns: column names to keep as columns\n param: functions: aggregation functions from the Groupby class as\n\n example:\n ```\n >>> t.show()\n +=====+=====+=====+\n | A | B | C |\n | int | int | int |\n +-----+-----+-----+\n | 1| 1| 6|\n | 1| 2| 5|\n | 2| 3| 4|\n | 2| 4| 3|\n | 3| 5| 2|\n | 3| 6| 1|\n | 1| 1| 6|\n | 1| 2| 5|\n | 2| 3| 4|\n | 2| 4| 3|\n | 3| 5| 2|\n | 3| 6| 1|\n +=====+=====+=====+\n\n >>> t2 = t.pivot(rows=['C'], columns=['A'], functions=[('B', gb.sum)])\n >>> t2.show()\n +===+===+========+=====+=====+=====+\n | # | C |function|(A=1)|(A=2)|(A=3)|\n |row|int| str |mixed|mixed|mixed|\n +---+---+--------+-----+-----+-----+\n |0 | 6|Sum(B) | 2|None |None |\n |1 | 5|Sum(B) | 4|None |None |\n |2 | 4|Sum(B) |None | 6|None |\n |3 | 3|Sum(B) |None | 8|None |\n |4 | 2|Sum(B) |None |None | 10|\n |5 | 1|Sum(B) |None |None | 12|\n +===+===+========+=====+=====+=====+\n ```\n\n \"\"\"\n sub_cls_check(T, BaseTable)\n\n if isinstance(rows, str):\n rows = [rows]\n if not all(isinstance(i, str) for i in rows):\n raise TypeError(f\"Expected rows as a list of column names, not {[i for i in rows if not isinstance(i,str)]}\")\n\n if isinstance(columns, str):\n columns = [columns]\n if not all(isinstance(i, str) for i in columns):\n raise TypeError(\n f\"Expected columns as a list of column names, not {[i for i in columns if not isinstance(i, str)]}\"\n )\n\n if not isinstance(values_as_rows, bool):\n raise TypeError(f\"expected sum_on_rows as boolean, not {type(values_as_rows)}\")\n\n keys = rows + columns\n assert isinstance(keys, list)\n\n 
extra_steps = 2\n\n if pbar is None:\n total = extra_steps\n\n if len(functions) == 0:\n total = total + len(keys)\n else:\n total = total + len(T)\n\n pbar = tqdm(total=total, desc=\"pivot\")\n\n grpby = groupby(T, keys, functions, tqdm=tqdm)\n Constr = type(T)\n\n if len(grpby) == 0: # return empty table. This must be a test?\n pbar.update(extra_steps)\n return Constr()\n\n # split keys to determine grid dimensions\n row_key_index = {}\n col_key_index = {}\n\n r = len(rows)\n c = len(columns)\n g = len(functions)\n\n records = defaultdict(dict)\n\n for row in grpby.rows:\n row_key = tuple(row[:r])\n col_key = tuple(row[r : r + c])\n func_key = tuple(row[r + c :])\n\n if row_key not in row_key_index:\n row_key_index[row_key] = len(row_key_index) # Y\n\n if col_key not in col_key_index:\n col_key_index[col_key] = len(col_key_index) # X\n\n rix = row_key_index[row_key]\n cix = col_key_index[col_key]\n if cix in records:\n if rix in records[cix]:\n raise ValueError(\"this should be empty.\")\n records[cix][rix] = func_key\n\n pbar.update(1)\n result = type(T)()\n\n if values_as_rows: # ---> leads to more rows.\n # first create all columns left to right\n\n n = r + 1 # rows keys + 1 col for function values.\n cols = [[] for _ in range(n)]\n for row, ix in row_key_index.items():\n for col_name, f in functions:\n cols[-1].append(f\"{f}({col_name})\")\n for col_ix, v in enumerate(row):\n cols[col_ix].append(v)\n\n for col_name, values in zip(rows + [\"function\"], cols):\n col_name = unique_name(col_name, result.columns)\n result[col_name] = values\n col_length = len(cols[0])\n cols.clear()\n\n # then populate the sparse matrix.\n for col_key, c in col_key_index.items():\n col_name = \"(\" + \",\".join([f\"{col_name}={value}\" for col_name, value in zip(columns, col_key)]) + \")\"\n col_name = unique_name(col_name, result.columns)\n L = [None for _ in range(col_length)]\n for r, funcs in records[c].items():\n for ix, f in enumerate(funcs):\n L[g * r + ix] = f\n result[col_name] = L\n\n else: # ---> leads to more columns.\n n = r\n cols = [[] for _ in range(n)]\n for row in row_key_index:\n for col_ix, v in enumerate(row):\n cols[col_ix].append(v) # write key columns.\n\n for col_name, values in zip(rows, cols):\n result[col_name] = values\n\n col_length = len(row_key_index)\n\n # now populate the sparse matrix.\n for col_key, c in col_key_index.items(): # select column.\n cols, names = [], []\n\n for f, v in zip(functions, func_key):\n agg_col, func = f\n terms = \",\".join([agg_col] + [f\"{col_name}={value}\" for col_name, value in zip(columns, col_key)])\n col_name = f\"{func}({terms})\"\n col_name = unique_name(col_name, result.columns)\n names.append(col_name)\n cols.append([None for _ in range(col_length)])\n for r, funcs in records[c].items():\n for ix, f in enumerate(funcs):\n cols[ix][r] = f\n for name, col in zip(names, cols):\n result[name] = col\n\n pbar.update(1)\n\n return result\n "},{"location":"reference/pivots/#tablite.pivots.transpose","title":"tablite.pivots.transpose(T, tqdm=_tqdm) ","text":"performs a CCW matrix rotation of the table. 
Source code in tablite/pivots.py def transpose(T, tqdm=_tqdm):\n \"\"\"performs a CCW matrix rotation of the table.\"\"\"\n sub_cls_check(T, BaseTable)\n\n if len(T.columns) == 0:\n return type(T)()\n\n assert isinstance(T, BaseTable)\n new = type(T)()\n L = list(T.columns)\n new[L[0]] = L[1:]\n for row in tqdm(T.rows, desc=\"table transpose\", total=len(T)):\n new[row[0]] = row[1:]\n return new\n "},{"location":"reference/pivots/#tablite.pivots.pivot_transpose","title":"tablite.pivots.pivot_transpose(T, columns, keep=None, column_name='transpose', value_name='value', tqdm=_tqdm) ","text":"Transpose a selection of columns to rows. PARAMETER DESCRIPTION columns column names to transpose TYPE: list of column names keep column names to keep (repeat) TYPE: list of column names DEFAULT: None RETURNS DESCRIPTION Table with columns transposed to rows Example keep columns 1,2 and 3 and transpose the remaining columns, except sum . Input: | col1 | col2 | col3 | sun | mon | tue | ... | sat | sum |\n|------|------|------|-----|-----|-----|-----|-----|------|\n| 1234 | 2345 | 3456 | 456 | 567 | | ... | | 1023 |\n| 1244 | 2445 | 4456 | | 7 | | ... | | 7 |\n| ... | | | | | | | | |\n\n>>> t.transpose(keep=[col1, col2, col3], transpose=[sun,mon,tue,wed,thu,fri,sat])`\n\nOutput:\n|col1| col2| col3| transpose| value|\n|----|-----|-----|----------|------|\n|1234| 2345| 3456| sun | 456|\n|1234| 2345| 3456| mon | 567|\n|1244| 2445| 4456| mon | 7|\n Source code in tablite/pivots.py def pivot_transpose(T, columns, keep=None, column_name=\"transpose\", value_name=\"value\", tqdm=_tqdm):\n \"\"\"Transpose a selection of columns to rows.\n\n Args:\n columns (list of column names): column names to transpose\n keep (list of column names): column names to keep (repeat)\n\n Returns:\n Table: with columns transposed to rows\n\n Example:\n keep columns 1,2 and 3 and transpose the remaining columns, except `sum`.\n\n Input:\n ```\n | col1 | col2 | col3 | sun | mon | tue | ... | sat | sum |\n |------|------|------|-----|-----|-----|-----|-----|------|\n | 1234 | 2345 | 3456 | 456 | 567 | | ... | | 1023 |\n | 1244 | 2445 | 4456 | | 7 | | ... | | 7 |\n | ... 
| | | | | | | | |\n\n >>> t.transpose(keep=[col1, col2, col3], transpose=[sun,mon,tue,wed,thu,fri,sat])`\n\n Output:\n |col1| col2| col3| transpose| value|\n |----|-----|-----|----------|------|\n |1234| 2345| 3456| sun | 456|\n |1234| 2345| 3456| mon | 567|\n |1244| 2445| 4456| mon | 7|\n ```\n\n \"\"\"\n sub_cls_check(T, BaseTable)\n\n if not isinstance(columns, list):\n raise TypeError\n\n for i in columns:\n if not isinstance(i, str):\n raise TypeError\n if i not in T.columns:\n raise ValueError\n if columns.count(i)>1:\n raise ValueError(f\"Column {i} appears more than once\")\n\n if keep is None:\n keep = []\n for i in keep:\n if not isinstance(i, str):\n raise TypeError\n if i not in T.columns:\n raise ValueError\n\n if column_name in keep + columns:\n column_name = unique_name(column_name, set_of_names=keep + columns)\n if value_name in keep + columns + [column_name]:\n value_name = unique_name(value_name, set_of_names=keep + columns)\n\n new = type(T)()\n new.add_columns(*keep + [column_name, value_name])\n news = {name: [] for name in new.columns}\n\n n = len(keep)\n\n with tqdm(total=len(T), desc=\"transpose\", disable=Config.TQDM_DISABLE) as pbar:\n it = T[keep + columns].rows if len(keep + columns) > 1 else ((v, ) for v in T[keep + columns])\n\n for ix, row in enumerate(it, start=1):\n keeps = row[:n]\n transposes = row[n:]\n\n for name, value in zip(keep, keeps):\n news[name].extend([value] * len(transposes))\n for name, value in zip(columns, transposes):\n news[column_name].append(name)\n news[value_name].append(value)\n\n if ix % Config.SINGLE_PROCESSING_LIMIT == 0:\n for name, values in news.items():\n new[name].extend(values)\n values.clear()\n\n pbar.update(1)\n\n for name, values in news.items():\n new[name].extend(np.array(values))\n values.clear()\n return new\n "},{"location":"reference/redux/","title":"Redux","text":""},{"location":"reference/redux/#tablite.redux","title":"tablite.redux ","text":""},{"location":"reference/redux/#tablite.redux-attributes","title":"Attributes","text":""},{"location":"reference/redux/#tablite.redux-classes","title":"Classes","text":""},{"location":"reference/redux/#tablite.redux-functions","title":"Functions","text":""},{"location":"reference/redux/#tablite.redux.filter_all","title":"tablite.redux.filter_all(T, **kwargs) ","text":"returns Table for rows where ALL kwargs match :param kwargs: dictionary with headers and values / boolean callable Examples: t = Table()\nt['a'] = [1,2,3,4]\nt['b'] = [10,20,30,40]\n\ndef f(x):\n return x == 4\ndef g(x):\n return x < 20\n\nt2 = t.any( **{\"a\":f, \"b\":g})\nassert [r for r in t2.rows] == [[1, 10], [4, 40]]\n\nt2 = t.any(a=f,b=g)\nassert [r for r in t2.rows] == [[1, 10], [4, 40]]\n\ndef h(x):\n return x>=2\n\ndef i(x):\n return x<=30\n\nt2 = t.all(a=h,b=i)\nassert [r for r in t2.rows] == [[2,20], [3, 30]]\n Source code in tablite/redux.py def filter_all(T, **kwargs):\n \"\"\"\n returns Table for rows where ALL kwargs match\n :param kwargs: dictionary with headers and values / boolean callable\n\n Examples:\n\n t = Table()\n t['a'] = [1,2,3,4]\n t['b'] = [10,20,30,40]\n\n def f(x):\n return x == 4\n def g(x):\n return x < 20\n\n t2 = t.any( **{\"a\":f, \"b\":g})\n assert [r for r in t2.rows] == [[1, 10], [4, 40]]\n\n t2 = t.any(a=f,b=g)\n assert [r for r in t2.rows] == [[1, 10], [4, 40]]\n\n def h(x):\n return x>=2\n\n def i(x):\n return x<=30\n\n t2 = t.all(a=h,b=i)\n assert [r for r in t2.rows] == [[2,20], [3, 30]]\n\n\n \"\"\"\n sub_cls_check(T, BaseTable)\n\n if not isinstance(kwargs, 
dict):\n raise TypeError(\"did you forget to add the ** in front of your dict?\")\n if not all([k in T.columns for k in kwargs]):\n raise ValueError(f\"Unknown column(s): {[k for k in kwargs if k not in T.columns]}\")\n\n mask = np.full((len(T),), True)\n for k, v in kwargs.items():\n col = T[k]\n for start, end, page in col.iter_by_page():\n data = page.get()\n if callable(v):\n vf = np.frompyfunc(v, 1, 1)\n mask[start:end] = mask[start:end] & np.apply_along_axis(vf, 0, data)\n else:\n mask[start:end] = mask[start:end] & (data == v)\n\n return _compress_one(T, mask)\n "},{"location":"reference/redux/#tablite.redux.drop","title":"tablite.redux.drop(T, *args) ","text":"drops all rows that contain args PARAMETER DESCRIPTION T TYPE: Table Source code in tablite/redux.py def drop(T, *args):\n \"\"\"drops all rows that contain args\n\n Args:\n T (Table):\n \"\"\"\n sub_cls_check(T, BaseTable)\n mask = np.full((len(T),), False)\n for name in T.columns:\n col = T[name]\n for start, end, page in col.iter_by_page():\n data = page.get()\n for arg in args:\n mask[start:end] = mask[start:end] | (data == arg)\n\n mask = np.invert(mask)\n return _compress_one(T, mask)\n "},{"location":"reference/redux/#tablite.redux.filter_any","title":"tablite.redux.filter_any(T, **kwargs) ","text":"returns Table for rows where ANY kwargs match :param kwargs: dictionary with headers and values / boolean callable Source code in tablite/redux.py def filter_any(T, **kwargs):\n \"\"\"\n returns Table for rows where ANY kwargs match\n :param kwargs: dictionary with headers and values / boolean callable\n \"\"\"\n sub_cls_check(T, BaseTable)\n if not isinstance(kwargs, dict):\n raise TypeError(\"did you forget to add the ** in front of your dict?\")\n\n mask = np.full((len(T),), False)\n for k, v in kwargs.items():\n col = T[k]\n for start, end, page in col.iter_by_page():\n data = page.get()\n if callable(v):\n vf = np.frompyfunc(v, 1, 1)\n mask[start:end] = mask[start:end] | np.apply_along_axis(vf, 0, data)\n else:\n mask[start:end] = mask[start:end] | (v == data)\n\n return _compress_one(T, mask)\n "},{"location":"reference/redux/#tablite.redux.filter_non_primitive","title":"tablite.redux.filter_non_primitive(T, expressions, filter_type='all', tqdm=_tqdm) ","text":"OBSOLETE filters table PARAMETER DESCRIPTION T Table. TYPE: Table subclass expressions str: filters based on an expression, such as: \"all((A==B, C!=4, 200<D))\" which is interpreted using python's compiler to: def _f(A,B,C,D):\n return all((A==B, C!=4, 200<D))\n list of dicts: (example): L = [ {'column1':'A', 'criteria': \"==\", 'column2': 'B'}, {'column1':'C', 'criteria': \"!=\", \"value2\": '4'}, {'value1': 200, 'criteria': \"<\", column2: 'D' } ] TYPE: list or str accepted 'column1', 'column2', 'criteria', 'value1', 'value2' TYPE: dictionary keys filter_type Ignored if expressions is str. 'all' or 'any'. Defaults to \"all\". TYPE: str DEFAULT: 'all' tqdm progressbar. Defaults to _tqdm. 
TYPE: tqdm DEFAULT: tqdm RETURNS DESCRIPTION 2xTables trues, falses Source code in tablite/redux.py def filter_non_primitive(T, expressions, filter_type=\"all\", tqdm=_tqdm):\n \"\"\"\n OBSOLETE\n filters table\n\n\n Args:\n T (Table subclass): Table.\n expressions (list or str):\n str:\n filters based on an expression, such as:\n \"all((A==B, C!=4, 200<D))\"\n which is interpreted using python's compiler to:\n\n def _f(A,B,C,D):\n return all((A==B, C!=4, 200<D))\n\n list of dicts: (example):\n\n L = [\n {'column1':'A', 'criteria': \"==\", 'column2': 'B'},\n {'column1':'C', 'criteria': \"!=\", \"value2\": '4'},\n {'value1': 200, 'criteria': \"<\", column2: 'D' }\n ]\n\n accepted dictionary keys: 'column1', 'column2', 'criteria', 'value1', 'value2'\n\n filter_type (str, optional): Ignored if expressions is str.\n 'all' or 'any'. Defaults to \"all\".\n tqdm (tqdm, optional): progressbar. Defaults to _tqdm.\n\n Returns:\n 2xTables: trues, falses\n \"\"\"\n # determine method\n warnings.warn(\"Filter using non-primitive types is not recommended.\")\n sub_cls_check(T, BaseTable)\n if len(T) == 0:\n return T.copy(), T.copy()\n\n with tqdm(desc=\"filter\", total=20) as pbar:\n if isinstance(expressions, str):\n mask = _filter_using_expression(T, expressions)\n pbar.update(10)\n elif isinstance(expressions, list):\n mask = _filter_using_list_of_dicts(T, expressions, filter_type, pbar)\n else:\n raise TypeError\n # create new tables\n res = _compress_both(T, mask, pbar=pbar)\n pbar.update(pbar.total - pbar.n)\n\n return res\n "},{"location":"reference/redux/#tablite.redux.filter","title":"tablite.redux.filter(T, expressions, filter_type='all', tqdm=_tqdm) ","text":"filters table Note: At the moment only tablite primitive types are supported PARAMETER DESCRIPTION T Table. TYPE: Table subclass expressions str: filters based on an expression, such as: \"all((A==B, C!=4, 200<D))\" which is interpreted using python's compiler to: def _f(A,B,C,D):\n return all((A==B, C!=4, 200<D))\n list of dicts: (example): L = [ {'column1':'A', 'criteria': \"==\", 'column2': 'B'}, {'column1':'C', 'criteria': \"!=\", \"value2\": '4'}, {'value1': 200, 'criteria': \"<\", column2: 'D' } ] TYPE: list or str accepted 'column1', 'column2', 'criteria', 'value1', 'value2' TYPE: dictionary keys filter_type Ignored if expressions is str. 'all' or 'any'. Defaults to \"all\". TYPE: str DEFAULT: 'all' tqdm progressbar. Defaults to _tqdm. TYPE: tqdm DEFAULT: tqdm RETURNS DESCRIPTION 2xTables trues, falses Source code in tablite/redux.py def filter(T, expressions, filter_type=\"all\", tqdm=_tqdm):\n \"\"\"filters table\n Note: At the moment only tablite primitive types are supported\n\n Args:\n T (Table subclass): Table.\n expressions (list or str):\n str:\n filters based on an expression, such as:\n \"all((A==B, C!=4, 200<D))\"\n which is interpreted using python's compiler to:\n\n def _f(A,B,C,D):\n return all((A==B, C!=4, 200<D))\n\n list of dicts: (example):\n\n L = [\n {'column1':'A', 'criteria': \"==\", 'column2': 'B'},\n {'column1':'C', 'criteria': \"!=\", \"value2\": '4'},\n {'value1': 200, 'criteria': \"<\", column2: 'D' }\n ]\n\n accepted dictionary keys: 'column1', 'column2', 'criteria', 'value1', 'value2'\n\n filter_type (str, optional): Ignored if expressions is str.\n 'all' or 'any'. Defaults to \"all\".\n tqdm (tqdm, optional): progressbar. 
Defaults to _tqdm.\n\n Returns:\n 2xTables: trues, falses\n \"\"\"\n # determine method\n sub_cls_check(T, BaseTable)\n if len(T) == 0:\n return T.copy(), T.copy()\n\n if isinstance(expressions, str):\n with tqdm(desc=\"filter\", total=20) as pbar:\n # TODO: make parser for expressions and use the nim implement\n mask = _filter_using_expression(T, expressions)\n pbar.update(10)\n res = _compress_both(T, mask, pbar=pbar)\n pbar.update(pbar.total - pbar.n)\n elif isinstance(expressions, list):\n return _filter_using_list_of_dicts_native(T, expressions, filter_type, tqdm)\n else:\n raise TypeError\n # create new tables\n\n return res\n "},{"location":"reference/reindex/","title":"Reindex","text":""},{"location":"reference/reindex/#tablite.reindex","title":"tablite.reindex ","text":""},{"location":"reference/reindex/#tablite.reindex-classes","title":"Classes","text":""},{"location":"reference/reindex/#tablite.reindex-functions","title":"Functions","text":""},{"location":"reference/reindex/#tablite.reindex.reindex","title":"tablite.reindex.reindex(T, index, names=None, tqdm=_tqdm, pbar=None) ","text":"Constant Memory helper for reindexing pages. Memory usage is set by datatype and Config.PAGE_SIZE PARAMETER DESCRIPTION T subclass of Table TYPE: Table index int64. TYPE: array names list of names from T to reindex. TYPE: (list, str) DEFAULT: None tqdm Defaults to _tqdm. TYPE: tqdm DEFAULT: tqdm pbar Defaults to None. TYPE: pbar DEFAULT: None RETURNS DESCRIPTION _type_ description Source code in tablite/reindex.py def reindex(T, index, names=None, tqdm=_tqdm, pbar=None):\n \"\"\"Constant Memory helper for reindexing pages.\n\n Memory usage is set by datatype and Config.PAGE_SIZE\n\n Args:\n T (Table): subclass of Table\n index (np.array): int64.\n names (list, str): list of names from T to reindex.\n tqdm (tqdm, optional): Defaults to _tqdm.\n pbar (pbar, optional): Defaults to None.\n\n Returns:\n _type_: _description_\n \"\"\"\n if names is None:\n names = list(T.columns.keys())\n\n if pbar is None:\n total = len(names)\n pbar = tqdm(total=total, desc=\"join\", disable=Config.TQDM_DISABLE)\n\n sub_cls_check(T, BaseTable)\n cls = type(T)\n result = cls()\n for name in names:\n result.add_column(name)\n col = result[name]\n\n for start, end in Config.page_steps(len(index)):\n indices = index[start:end]\n values = T[name].get_by_indices(indices)\n # in these values, the index of -1 will be wrong.\n # so if there is any -1 in the indices, they will\n # have to be replaced with Nones\n mask = indices == -1\n if np.any(mask):\n nones = np.full(index.shape, fill_value=None)\n values = np.where(mask, nones, values)\n col.extend(values)\n pbar.update(1)\n\n return result\n "},{"location":"reference/sort_utils/","title":"Sort utils","text":""},{"location":"reference/sort_utils/#tablite.sort_utils","title":"tablite.sort_utils ","text":""},{"location":"reference/sort_utils/#tablite.sort_utils-attributes","title":"Attributes","text":""},{"location":"reference/sort_utils/#tablite.sort_utils.uca_collator","title":"tablite.sort_utils.uca_collator = Collator() module-attribute ","text":""},{"location":"reference/sort_utils/#tablite.sort_utils.modes","title":"tablite.sort_utils.modes = {'alphanumeric': text_sort, 'unix': unix_sort, 'excel': excel_sort} module-attribute ","text":""},{"location":"reference/sort_utils/#tablite.sort_utils-classes","title":"Classes","text":""},{"location":"reference/sort_utils/#tablite.sort_utils.HashDict","title":"tablite.sort_utils.HashDict ","text":" Bases: dict This class is just 
a nice syntactic sugar for debugging. Functions identically to a regular dictionary, just uses a tupled key. "},{"location":"reference/sort_utils/#tablite.sort_utils.HashDict-functions","title":"Functions","text":""},{"location":"reference/sort_utils/#tablite.sort_utils.HashDict.items","title":"tablite.sort_utils.HashDict.items() ","text":"Source code in tablite/sort_utils.py def items(self):\n return [(k, v) for (_, k), v in super().items()]\n "},{"location":"reference/sort_utils/#tablite.sort_utils.HashDict.keys","title":"tablite.sort_utils.HashDict.keys() ","text":"Source code in tablite/sort_utils.py def keys(self):\n return [k for (_, k) in super().keys()]\n "},{"location":"reference/sort_utils/#tablite.sort_utils.HashDict.__iter__","title":"tablite.sort_utils.HashDict.__iter__() -> Iterator ","text":"Source code in tablite/sort_utils.py def __iter__(self) -> Iterator:\n return (k for (_, k) in super().keys())\n "},{"location":"reference/sort_utils/#tablite.sort_utils.HashDict.__getitem__","title":"tablite.sort_utils.HashDict.__getitem__(key) ","text":"Source code in tablite/sort_utils.py def __getitem__(self, key):\n return super().__getitem__(self._get_hash(key))\n "},{"location":"reference/sort_utils/#tablite.sort_utils.HashDict.__setitem__","title":"tablite.sort_utils.HashDict.__setitem__(key, value) ","text":"Source code in tablite/sort_utils.py def __setitem__(self, key, value):\n return super().__setitem__(self._get_hash(key), value)\n "},{"location":"reference/sort_utils/#tablite.sort_utils.HashDict.__contains__","title":"tablite.sort_utils.HashDict.__contains__(key) -> bool ","text":"Source code in tablite/sort_utils.py def __contains__(self, key) -> bool:\n return super().__contains__(self._get_hash(key))\n "},{"location":"reference/sort_utils/#tablite.sort_utils.HashDict.__delitem__","title":"tablite.sort_utils.HashDict.__delitem__(key) ","text":"Source code in tablite/sort_utils.py def __delitem__(self, key):\n return super().__delitem__(self._get_hash(key))\n "},{"location":"reference/sort_utils/#tablite.sort_utils.HashDict.__repr__","title":"tablite.sort_utils.HashDict.__repr__() -> str ","text":"Source code in tablite/sort_utils.py def __repr__(self) -> str:\n return '{' + \", \".join([f\"{k}: {v}\" for k, v in self.items()]) + '}'\n "},{"location":"reference/sort_utils/#tablite.sort_utils.HashDict.__str__","title":"tablite.sort_utils.HashDict.__str__() -> str ","text":"Source code in tablite/sort_utils.py def __str__(self) -> str:\n return repr(self)\n "},{"location":"reference/sort_utils/#tablite.sort_utils-functions","title":"Functions","text":""},{"location":"reference/sort_utils/#tablite.sort_utils.text_sort","title":"tablite.sort_utils.text_sort(values, reverse=False) ","text":"Sorts everything as text. 
Source code in tablite/sort_utils.py def text_sort(values, reverse=False):\n \"\"\"\n Sorts everything as text.\n \"\"\"\n text = {str(i): i for i in values}\n L = list(text.keys())\n L.sort(key=uca_collator.sort_key, reverse=reverse)\n d = {text[value]: ix for ix, value in enumerate(L)}\n return d\n "},{"location":"reference/sort_utils/#tablite.sort_utils.unix_sort","title":"tablite.sort_utils.unix_sort(values, reverse=False) ","text":"Unix sortation sorts by the following order: | rank | type | value | +------+-----------+--------------------------------------------+ | 0 | None | floating point -infinite | | 1 | bool | 0 as False, 1 as True | | 2 | int | as numeric value | | 2 | float | as numeric value | | 3 | time | \u03c4 * seconds into the day / (24 * 60 * 60) | | 4 | date | as integer days since 1970/1/1 | | 5 | datetime | as float using date (int) + time (decimal) | | 6 | timedelta | as float using date (int) + time (decimal) | | 7 | str | using unicode | +------+-----------+--------------------------------------------+ \u03c4 = 2 * \u03c0 Source code in tablite/sort_utils.py def unix_sort(values, reverse=False):\n \"\"\"\n Unix sortation sorts by the following order:\n\n | rank | type | value |\n +------+-----------+--------------------------------------------+\n | 0 | None | floating point -infinite |\n | 1 | bool | 0 as False, 1 as True |\n | 2 | int | as numeric value |\n | 2 | float | as numeric value |\n | 3 | time | \u03c4 * seconds into the day / (24 * 60 * 60) |\n | 4 | date | as integer days since 1970/1/1 |\n | 5 | datetime | as float using date (int) + time (decimal) |\n | 6 | timedelta | as float using date (int) + time (decimal) |\n | 7 | str | using unicode |\n +------+-----------+--------------------------------------------+\n\n \u03c4 = 2 * \u03c0\n\n \"\"\"\n text, non_text = [], []\n\n # L = []\n # text = [i for i in values if isinstance(i, str)]\n # text.sort(key=uca_collator.sort_key, reverse=reverse)\n # text_code = _unix_typecodes[str]\n # L = [(text_code, ix, v) for ix, v in enumerate(text)]\n\n for value in values:\n if isinstance(value, str):\n text.append(value)\n else:\n t = type(value)\n TC = _unix_typecodes[t]\n tf = _unix_value_function[t]\n VC = tf(value)\n non_text.append((TC, VC, value))\n non_text.sort(reverse=reverse)\n\n text.sort(key=uca_collator.sort_key, reverse=reverse)\n text_code = _unix_typecodes[str]\n text = [(text_code, ix, v) for ix, v in enumerate(text)]\n\n d = HashDict()\n L = non_text + text\n for ix, (_, _, value) in enumerate(L):\n d[value] = ix\n return d\n "},{"location":"reference/sort_utils/#tablite.sort_utils.excel_sort","title":"tablite.sort_utils.excel_sort(values, reverse=False) ","text":"Excel sortation sorts by the following order: | rank | type | value | +------+-----------+--------------------------------------------+ | 1 | int | as numeric value | | 1 | float | as numeric value | | 1 | time | as seconds into the day / (24 * 60 * 60) | | 1 | date | as integer days since 1900/1/1 | | 1 | datetime | as float using date (int) + time (decimal) | | (1)*| timedelta | as float using date (int) + time (decimal) | | 2 | str | using unicode | | 3 | bool | 0 as False, 1 as True | | 4 | None | floating point infinite. | +------+-----------+--------------------------------------------+ - Excel doesn't have timedelta.
Source code in tablite/sort_utils.py def excel_sort(values, reverse=False):\n \"\"\"\n Excel sortation sorts by the following order:\n\n | rank | type | value |\n +------+-----------+--------------------------------------------+\n | 1 | int | as numeric value |\n | 1 | float | as numeric value |\n | 1 | time | as seconds into the day / (24 * 60 * 60) |\n | 1 | date | as integer days since 1900/1/1 |\n | 1 | datetime | as float using date (int) + time (decimal) |\n | (1)*| timedelta | as float using date (int) + time (decimal) |\n | 2 | str | using unicode |\n | 3 | bool | 0 as False, 1 as True |\n | 4 | None | floating point infinite. |\n +------+-----------+--------------------------------------------+\n\n * Excel doesn't have timedelta.\n \"\"\"\n\n def tup(TC, value):\n return (TC, _excel_value_function[t](value), value)\n\n text, numeric, booles, nones = [], [], [], []\n for value in values:\n t = type(value)\n TC = _excel_typecodes[t]\n\n if TC == 0:\n numeric.append(tup(TC, value))\n elif TC == 1:\n text.append(value) # text is processed later.\n elif TC == 2:\n booles.append(tup(TC, value))\n elif TC == 3:\n booles.append(tup(TC, value))\n else:\n raise TypeError(f\"no typecode for {value}\")\n\n if text:\n text.sort(key=uca_collator.sort_key, reverse=reverse)\n text = [(2, ix, v) for ix, v in enumerate(text)]\n\n numeric.sort(reverse=reverse)\n booles.sort(reverse=reverse)\n nones.sort(reverse=reverse)\n\n if reverse:\n L = nones + booles + text + numeric\n else:\n L = numeric + text + booles + nones\n d = {value: ix for ix, (_, _, value) in enumerate(L)}\n return d\n "},{"location":"reference/sort_utils/#tablite.sort_utils.rank","title":"tablite.sort_utils.rank(values, reverse, mode) ","text":"values: list of values to sort. reverse: bool mode: as 'text', as 'numeric' or as 'excel' return: dict: d[value] = rank Source code in tablite/sort_utils.py def rank(values, reverse, mode):\n \"\"\"\n values: list of values to sort.\n reverse: bool\n mode: as 'text', as 'numeric' or as 'excel'\n return: dict: d[value] = rank\n \"\"\"\n if mode not in modes:\n raise ValueError(f\"{mode} not in list of modes: {list(modes)}\")\n f = modes.get(mode)\n return f(values, reverse)\n "},{"location":"reference/sortation/","title":"Sortation","text":""},{"location":"reference/sortation/#tablite.sortation","title":"tablite.sortation ","text":""},{"location":"reference/sortation/#tablite.sortation-attributes","title":"Attributes","text":""},{"location":"reference/sortation/#tablite.sortation-classes","title":"Classes","text":""},{"location":"reference/sortation/#tablite.sortation-functions","title":"Functions","text":""},{"location":"reference/sortation/#tablite.sortation.sort_index","title":"tablite.sortation.sort_index(T, mapping, sort_mode='excel', tqdm=_tqdm, pbar=None) ","text":"helper for methods sort and is_sorted param: sort_mode: str: \"alphanumeric\", \"unix\", or, \"excel\" (default) param: **kwargs: sort criteria. See Table.sort() Source code in tablite/sortation.py def sort_index(T, mapping, sort_mode=\"excel\", tqdm=_tqdm, pbar=None):\n \"\"\"\n helper for methods `sort` and `is_sorted`\n\n param: sort_mode: str: \"alphanumeric\", \"unix\", or, \"excel\" (default)\n param: **kwargs: sort criteria. 
See Table.sort()\n \"\"\"\n\n sub_cls_check(T, BaseTable)\n\n if not isinstance(mapping, dict) or not mapping:\n raise TypeError(\"Expected mapping (dict)?\")\n\n for k, v in mapping.items():\n if k not in T.columns:\n raise ValueError(f\"no column {k}\")\n if not isinstance(v, bool):\n raise ValueError(f\"{k} was mapped to {v} - a non-boolean\")\n\n if sort_mode not in sort_modes:\n raise ValueError(f\"{sort_mode} not in list of sort_modes: {list(sort_modes)}\")\n\n rank = {i: tuple() for i in range(len(T))} # create index and empty tuple for sortation.\n\n _pbar = tqdm(total=len(mapping.items()), desc=\"creating sort index\") if pbar is None else pbar\n\n for key, reverse in mapping.items():\n col = T[key][:]\n ranks = sort_rank(values=[numpy_to_python(v) for v in multitype_set(col)], reverse=reverse, mode=sort_mode)\n assert isinstance(ranks, dict)\n for ix, v in enumerate(col):\n v2 = numpy_to_python(v)\n rank[ix] += (ranks[v2],) # add tuple for each sortation level.\n\n _pbar.update(1)\n\n del col\n del ranks\n\n new_order = [(r, i) for i, r in rank.items()] # tuples are listed and sort...\n del rank # free memory.\n\n new_order.sort()\n sorted_index = [i for _, i in new_order] # new index is extracted.\n new_order.clear()\n return np.array(sorted_index, dtype=np.int64)\n "},{"location":"reference/sortation/#tablite.sortation.reindex","title":"tablite.sortation.reindex(T, index) ","text":"index: list of integers that declare sort order. Examples: Table: ['a','b','c','d','e','f','g','h']\nindex: [0,2,4,6]\nresult: ['b','d','f','h']\n\nTable: ['a','b','c','d','e','f','g','h']\nindex: [0,2,4,6,1,3,5,7]\nresult: ['a','c','e','g','b','d','f','h']\n Source code in tablite/sortation.py def reindex(T, index):\n \"\"\"\n index: list of integers that declare sort order.\n\n Examples:\n\n Table: ['a','b','c','d','e','f','g','h']\n index: [0,2,4,6]\n result: ['b','d','f','h']\n\n Table: ['a','b','c','d','e','f','g','h']\n index: [0,2,4,6,1,3,5,7]\n result: ['a','c','e','g','b','d','f','h']\n\n \"\"\"\n sub_cls_check(T, BaseTable)\n if isinstance(index, list):\n index = np.array(index, dtype=int)\n type_check(index, np.ndarray)\n if max(index) >= len(T):\n raise IndexError(\"index out of range: max(index) > len(self)\")\n if min(index) < -len(T):\n raise IndexError(\"index out of range: min(index) < -len(self)\")\n\n fields = len(T) * len(T.columns)\n m = select_processing_method(fields, _reindex, _mp_reindex)\n return m(T, index)\n "},{"location":"reference/sortation/#tablite.sortation.sort","title":"tablite.sortation.sort(T, mapping, sort_mode='excel', tqdm=_tqdm, pbar: _tqdm = None) ","text":"Perform multi-pass sorting with precedence given order of column names. sort_mode: str: \"alphanumeric\", \"unix\", or, \"excel\" kwargs: keys: columns, values: 'reverse' as boolean. examples: Table.sort('A'=False) means sort by 'A' in ascending order. Table.sort('A'=True, 'B'=False) means sort 'A' in descending order, then (2nd priority) sort B in ascending order. 
Source code in tablite/sortation.py def sort(T, mapping, sort_mode=\"excel\", tqdm=_tqdm, pbar: _tqdm = None):\n \"\"\"Perform multi-pass sorting with precedence given order of column names.\n sort_mode: str: \"alphanumeric\", \"unix\", or, \"excel\"\n kwargs:\n keys: columns,\n values: 'reverse' as boolean.\n\n examples:\n Table.sort('A'=False) means sort by 'A' in ascending order.\n Table.sort('A'=True, 'B'=False) means sort 'A' in descending order, then (2nd priority)\n sort B in ascending order.\n \"\"\"\n sub_cls_check(T, BaseTable)\n\n index = sort_index(T, mapping, sort_mode=sort_mode, tqdm=_tqdm, pbar=pbar)\n m = select_processing_method(len(T) * len(T.columns), _sp_reindex, _mp_reindex)\n return m(T, index, tqdm=tqdm, pbar=pbar)\n "},{"location":"reference/sortation/#tablite.sortation.is_sorted","title":"tablite.sortation.is_sorted(T, mapping, sort_mode='excel') ","text":"Performs multi-pass sorting check with precedence given order of column names. PARAMETER DESCRIPTION mapping sort criteria. See Table.sort() RETURNS DESCRIPTION bool Source code in tablite/sortation.py def is_sorted(T, mapping, sort_mode=\"excel\"):\n \"\"\"Performs multi-pass sorting check with precedence given order of column names.\n\n Args:\n mapping: sort criteria. See Table.sort()\n sort_mode = sort mode. See Table.sort()\n\n Returns:\n bool\n \"\"\"\n index = sort_index(T, mapping, sort_mode=sort_mode)\n match = np.arange(len(T))\n return np.all(index == match)\n "},{"location":"reference/tools/","title":"Tools","text":""},{"location":"reference/tools/#tablite.tools","title":"tablite.tools ","text":""},{"location":"reference/tools/#tablite.tools-attributes","title":"Attributes","text":""},{"location":"reference/tools/#tablite.tools.guess","title":"tablite.tools.guess = DataTypes.guess module-attribute ","text":""},{"location":"reference/tools/#tablite.tools.xround","title":"tablite.tools.xround = DataTypes.round module-attribute ","text":""},{"location":"reference/tools/#tablite.tools-classes","title":"Classes","text":""},{"location":"reference/tools/#tablite.tools-functions","title":"Functions","text":""},{"location":"reference/tools/#tablite.tools.head","title":"tablite.tools.head(path, linecount=5, delimiter=None) ","text":"Gets the head of any supported file format. 
Source code in tablite/tools.py def head(path, linecount=5, delimiter=None):\n \"\"\"\n Gets the head of any supported file format.\n \"\"\"\n return get_headers(path, linecount=linecount, delimiter=delimiter)\n "},{"location":"reference/utils/","title":"Utils","text":""},{"location":"reference/utils/#tablite.utils","title":"tablite.utils ","text":""},{"location":"reference/utils/#tablite.utils-attributes","title":"Attributes","text":""},{"location":"reference/utils/#tablite.utils.letters","title":"tablite.utils.letters = string.ascii_lowercase + string.digits module-attribute ","text":""},{"location":"reference/utils/#tablite.utils.NoneType","title":"tablite.utils.NoneType = type(None) module-attribute ","text":""},{"location":"reference/utils/#tablite.utils.required_keys","title":"tablite.utils.required_keys = {'min', 'max', 'mean', 'median', 'stdev', 'mode', 'distinct', 'iqr_low', 'iqr_high', 'iqr', 'sum', 'summary type', 'histogram'} module-attribute ","text":""},{"location":"reference/utils/#tablite.utils.summary_methods","title":"tablite.utils.summary_methods = {bool: _boolean_statistics_summary, int: _numeric_statistics_summary, float: _numeric_statistics_summary, str: _string_statistics_summary, date: _date_statistics_summary, datetime: _datetime_statistics_summary, time: _time_statistics_summary, timedelta: _timedelta_statistics_summary, type(None): _none_type_summary} module-attribute ","text":""},{"location":"reference/utils/#tablite.utils-classes","title":"Classes","text":""},{"location":"reference/utils/#tablite.utils-functions","title":"Functions","text":""},{"location":"reference/utils/#tablite.utils.generate_random_string","title":"tablite.utils.generate_random_string(len) ","text":"Source code in tablite/utils.py def generate_random_string(len):\n return \"\".join(random.choice(letters) for i in range(len))\n "},{"location":"reference/utils/#tablite.utils.type_check","title":"tablite.utils.type_check(var, kind) ","text":"Source code in tablite/utils.py def type_check(var, kind):\n if not isinstance(var, kind):\n raise TypeError(f\"Expected {kind}, not {type(var)}\")\n "},{"location":"reference/utils/#tablite.utils.sub_cls_check","title":"tablite.utils.sub_cls_check(c, kind) ","text":"Source code in tablite/utils.py def sub_cls_check(c, kind):\n if not issubclass(type(c), kind):\n raise TypeError(f\"Expected {kind}, not {type(c)}\")\n "},{"location":"reference/utils/#tablite.utils.name_check","title":"tablite.utils.name_check(options, *names) ","text":"Source code in tablite/utils.py def name_check(options, *names):\n for n in names:\n if n not in options:\n raise ValueError(f\"{n} not in {options}\")\n "},{"location":"reference/utils/#tablite.utils.unique_name","title":"tablite.utils.unique_name(wanted_name, set_of_names) ","text":"returns a wanted_name as wanted_name_i given a list of names which guarantees unique naming. 
Source code in tablite/utils.py def unique_name(wanted_name, set_of_names):\n \"\"\"\n returns a wanted_name as wanted_name_i given a list of names\n which guarantees unique naming.\n \"\"\"\n if not isinstance(set_of_names, set):\n set_of_names = set(set_of_names)\n name, i = wanted_name, 1\n while name in set_of_names:\n name = f\"{wanted_name}_{i}\"\n i += 1\n return name\n "},{"location":"reference/utils/#tablite.utils.expression_interpreter","title":"tablite.utils.expression_interpreter(expression, columns) ","text":"Interprets valid expressions such as: \"all((A==B, C!=4, 200<D))\"\n as def _f(A,B,C,D): return all((A==B, C!=4, 200<D)) using python's compiler. Source code in tablite/utils.py def expression_interpreter(expression, columns):\n \"\"\"\n Interprets valid expressions such as:\n\n \"all((A==B, C!=4, 200<D))\"\n\n as:\n def _f(A,B,C,D):\n return all((A==B, C!=4, 200<D))\n\n using python's compiler.\n \"\"\"\n if not isinstance(expression, str):\n raise TypeError(f\"`{expression}` is not a str\")\n if not isinstance(columns, list):\n raise TypeError\n if not all(isinstance(i, str) for i in columns):\n raise TypeError\n\n req_columns = \", \".join(i for i in columns if i in expression)\n script = f\"def f({req_columns}):\\n return {expression}\"\n tree = ast.parse(script)\n code = compile(tree, filename=\"blah\", mode=\"exec\")\n namespace = {}\n exec(code, namespace)\n f = namespace[\"f\"]\n if not callable(f):\n raise ValueError(f\"The expression could not be parse: {expression}\")\n return f\n "},{"location":"reference/utils/#tablite.utils.intercept","title":"tablite.utils.intercept(A, B) ","text":"Enables calculation of the intercept of two range objects. Used to determine if a datablock contains a slice. PARAMETER DESCRIPTION A range B range RETURNS DESCRIPTION range The intercept of ranges A and B. 
Source code in tablite/utils.py def intercept(A, B):\n \"\"\"Enables calculation of the intercept of two range objects.\n Used to determine if a datablock contains a slice.\n\n Args:\n A: range\n B: range\n\n Returns:\n range: The intercept of ranges A and B.\n \"\"\"\n type_check(A, range)\n type_check(B, range)\n\n if A.step < 1:\n A = range(A.stop + 1, A.start + 1, 1)\n if B.step < 1:\n B = range(B.stop + 1, B.start + 1, 1)\n\n if len(A) == 0:\n return range(0)\n if len(B) == 0:\n return range(0)\n\n if A.stop <= B.start:\n return range(0)\n if A.start >= B.stop:\n return range(0)\n\n if A.start <= B.start:\n if A.stop <= B.stop:\n start, end = B.start, A.stop\n elif A.stop > B.stop:\n start, end = B.start, B.stop\n else:\n raise ValueError(\"bad logic\")\n elif A.start < B.stop:\n if A.stop <= B.stop:\n start, end = A.start, A.stop\n elif A.stop > B.stop:\n start, end = A.start, B.stop\n else:\n raise ValueError(\"bad logic\")\n else:\n raise ValueError(\"bad logic\")\n\n a_steps = math.ceil((start - A.start) / A.step)\n a_start = (a_steps * A.step) + A.start\n\n b_steps = math.ceil((start - B.start) / B.step)\n b_start = (b_steps * B.step) + B.start\n\n if A.step == 1 or B.step == 1:\n start = max(a_start, b_start)\n step = max(A.step, B.step)\n return range(start, end, step)\n elif A.step == B.step:\n a, b = min(A.start, B.start), max(A.start, B.start)\n if (b - a) % A.step != 0: # then the ranges are offset.\n return range(0)\n else:\n return range(b, end, step)\n else:\n # determine common step size:\n step = max(A.step, B.step) if math.gcd(A.step, B.step) != 1 else A.step * B.step\n # examples:\n # 119 <-- 17 if 1 != 1 else 119 <-- max(7, 17) if math.gcd(7, 17) != 1 else 7 * 17\n # 30 <-- 30 if 3 != 1 else 90 <-- max(3, 30) if math.gcd(3, 30) != 1 else 3*30\n if A.step < B.step:\n for n in range(a_start, end, A.step): # increment in smallest step to identify the first common value.\n if n < b_start:\n continue\n elif (n - b_start) % B.step == 0:\n return range(n, end, step) # common value found.\n else:\n for n in range(b_start, end, B.step):\n if n < a_start:\n continue\n elif (n - a_start) % A.step == 0:\n return range(n, end, step)\n\n return range(0)\n "},{"location":"reference/utils/#tablite.utils.summary_statistics","title":"tablite.utils.summary_statistics(values, counts) ","text":"values: any type counts: integer returns dict with: - min (int/float, length of str, date) - max (int/float, length of str, date) - mean (int/float, length of str, date) - median (int/float, length of str, date) - stdev (int/float, length of str, date) - mode (int/float, length of str, date) - distinct (number of distinct values) - iqr (int/float, length of str, date) - sum (int/float, length of str, date) - histogram (2 arrays: values, count of each values) Source code in tablite/utils.py def summary_statistics(values, counts):\n \"\"\"\n values: any type\n counts: integer\n\n returns dict with:\n - min (int/float, length of str, date)\n - max (int/float, length of str, date)\n - mean (int/float, length of str, date)\n - median (int/float, length of str, date)\n - stdev (int/float, length of str, date)\n - mode (int/float, length of str, date)\n - distinct (number of distinct values)\n - iqr (int/float, length of str, date)\n - sum (int/float, length of str, date)\n - histogram (2 arrays: values, count of each values)\n \"\"\"\n # determine the dominant datatype:\n dtypes = defaultdict(int)\n most_frequent, most_frequent_dtype = 0, int\n for v, c in zip(values, counts):\n dtype = type(v)\n total 
= dtypes[dtype] + c\n dtypes[dtype] = total\n if total > most_frequent:\n most_frequent_dtype = dtype\n most_frequent = total\n\n if most_frequent == 0:\n return {}\n\n most_frequent_dtype = max(dtypes, key=dtypes.get)\n mask = [type(v) == most_frequent_dtype for v in values]\n v = list(compress(values, mask))\n c = list(compress(counts, mask))\n\n f = summary_methods.get(most_frequent_dtype, int)\n result = f(v, c)\n result[\"distinct\"] = len(values)\n result[\"summary type\"] = most_frequent_dtype.__name__\n result[\"histogram\"] = [values, counts]\n assert set(result.keys()) == required_keys, \"Key missing!\"\n return result\n "},{"location":"reference/utils/#tablite.utils.date_range","title":"tablite.utils.date_range(start, stop, step) ","text":"Source code in tablite/utils.py def date_range(start, stop, step):\n if not isinstance(start, datetime):\n raise TypeError(\"start is not datetime\")\n if not isinstance(stop, datetime):\n raise TypeError(\"stop is not datetime\")\n if not isinstance(step, timedelta):\n raise TypeError(\"step is not timedelta\")\n n = (stop - start) // step\n return [start + step * i for i in range(n)]\n "},{"location":"reference/utils/#tablite.utils.dict_to_rows","title":"tablite.utils.dict_to_rows(d) ","text":"Source code in tablite/utils.py def dict_to_rows(d):\n type_check(d, dict)\n rows = []\n max_length = max(len(i) for i in d.values())\n order = list(d.keys())\n rows.append(order)\n for i in range(max_length):\n row = [d[k][i] for k in order]\n rows.append(row)\n return rows\n "},{"location":"reference/utils/#tablite.utils.calc_col_count","title":"tablite.utils.calc_col_count(letters: str) ","text":"Source code in tablite/utils.py def calc_col_count(letters: str):\n ord_nil = ord(\"A\") - 1\n cols_per_letter = ord(\"Z\") - ord_nil\n col_count = 0\n\n for i, v in enumerate(reversed(letters)):\n col_count = col_count + (ord(v) - ord_nil) * pow(cols_per_letter, i)\n\n return col_count\n "},{"location":"reference/utils/#tablite.utils.calc_true_dims","title":"tablite.utils.calc_true_dims(sheet) ","text":"Source code in tablite/utils.py def calc_true_dims(sheet):\n src = sheet._get_source()\n max_col, max_row = 0, 0\n\n regex = re.compile(\"\\d+\")\n\n def handleStartElement(name, attrs):\n nonlocal max_col, max_row\n\n if name == \"c\":\n last_index = attrs[\"r\"]\n idx, _ = next(regex.finditer(last_index)).span()\n letters, digits = last_index[0:idx], int(last_index[idx:])\n\n col_idx, row_idx = calc_col_count(letters), digits\n\n max_col, max_row = max(max_col, col_idx), max(max_row, row_idx)\n\n parser = expat.ParserCreate()\n parser.buffer_text = True\n parser.StartElementHandler = handleStartElement\n parser.ParseFile(src)\n\n return max_col, max_row\n "},{"location":"reference/utils/#tablite.utils.fixup_worksheet","title":"tablite.utils.fixup_worksheet(worksheet) ","text":"Source code in tablite/utils.py def fixup_worksheet(worksheet):\n try:\n ws_cols, ws_rows = calc_true_dims(worksheet)\n\n worksheet._max_column = ws_cols\n worksheet._max_row = ws_rows\n except Exception as e:\n logging.error(f\"Failed to fetch true dimensions: {e}\")\n "},{"location":"reference/utils/#tablite.utils.update_access_time","title":"tablite.utils.update_access_time(path) ","text":"Source code in tablite/utils.py def update_access_time(path):\n path = Path(path)\n stat = path.stat()\n os.utime(path, (now(), stat.st_mtime))\n "},{"location":"reference/utils/#tablite.utils.load_numpy","title":"tablite.utils.load_numpy(path) ","text":"Source code in tablite/utils.py def 
load_numpy(path):\n update_access_time(path)\n\n return np.load(path, allow_pickle=True, fix_imports=False)\n "},{"location":"reference/utils/#tablite.utils.select_type_name","title":"tablite.utils.select_type_name(dtypes: dict) ","text":"Source code in tablite/utils.py def select_type_name(dtypes: dict):\n dtypes = [t for t in dtypes.items() if t[0] != NoneType]\n\n if len(dtypes) == 0:\n return \"empty\"\n\n (best_type, _), *_ = sorted(dtypes, key=lambda t: t[1], reverse=True)\n\n return best_type.__name__\n "},{"location":"reference/utils/#tablite.utils.get_predominant_types","title":"tablite.utils.get_predominant_types(table, all_dtypes=None) ","text":"Source code in tablite/utils.py def get_predominant_types(table, all_dtypes=None):\n if all_dtypes is None:\n all_dtypes = table.types()\n\n dtypes = {\n k: select_type_name(v)\n for k, v in all_dtypes.items()\n }\n\n return dtypes\n "},{"location":"reference/utils/#tablite.utils.py_to_nim_encoding","title":"tablite.utils.py_to_nim_encoding(encoding: str) -> str ","text":"Source code in tablite/utils.py def py_to_nim_encoding(encoding: str) -> str:\n if encoding is None or encoding.lower() in [\"ascii\", \"utf8\", \"utf-8\", \"utf-8-sig\"]:\n return \"ENC_UTF8\"\n elif encoding.lower() in [\"utf16\", \"utf-16\"]:\n return \"ENC_UTF16\"\n elif encoding in Config.NIM_SUPPORTED_CONV_TYPES:\n return f\"ENC_CONV|{encoding}\"\n\n raise NotImplementedError(f\"encoding not implemented: {encoding}\")\n "},{"location":"reference/version/","title":"Version","text":""},{"location":"reference/version/#tablite.version","title":"tablite.version ","text":""},{"location":"reference/version/#tablite.version-attributes","title":"Attributes","text":""},{"location":"reference/version/#tablite.version.__version_info__","title":"tablite.version.__version_info__ = (major, minor, patch) module-attribute ","text":""},{"location":"reference/version/#tablite.version.__version__","title":"tablite.version.__version__ = '.'.join(str(i) for i in __version_info__) module-attribute ","text":""}]}
\ No newline at end of file
+{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"Tablite","text":""},{"location":"#contents","title":"Contents","text":" - introduction
- installation
- feature overview
- api
- tutorial
- latest updates
- credits
"},{"location":"#introduction","title":"Introduction","text":"Tablite seeks to be the go-to library for manipulating tabular data with an api that is as close in syntax to pure python as possible. "},{"location":"#even-smaller-memory-footprint","title":"Even smaller memory footprint","text":"Tablite uses numpys fileformat as a backend with strong abstraction, so that copy, append & repetition of data is handled in pages. This is imperative for incremental data processing. Tablite tests for memory footprint. One test compares the memory footprint of 10,000,000 integers where tablite will use < 1 Mb RAM in contrast to python which will require around 133.7 Mb of RAM (1M lists with 10 integers). Tablite also tests to assure that working with 1Tb of data is tolerable. Tablite achieves this minimal memory footprint by using a temporary storage set in config.Config.workdir as tempfile.gettempdir()/tablite-tmp . If your OS (windows/linux/mac) sits on a SSD this will benefit from high IOPS and permit slices of 9,000,000,000 rows in less than a second. "},{"location":"#multiprocessing-enabled-by-default","title":"Multiprocessing enabled by default","text":"Tablite uses numpy whereever possible and applies multiprocessing for bypassing the GIL on all major operations. CSV import is performed in C through using nim s compiler and is as fast the hardware allows. "},{"location":"#all-algorithms-have-been-reworked-to-respect-memory-limits","title":"All algorithms have been reworked to respect memory limits","text":"Tablite respects the limits of free memory by tagging the free memory and defining task size before each memory intensive task is initiated (join, groupby, data import, etc). If you still run out of memory you may try to reduce the config.Config.PAGE_SIZE and rerun your program. "},{"location":"#100-support-for-all-python-datatypes","title":"100% support for all python datatypes","text":"Tablite wants to make it easy for you to work with data. tablite.Table's behave like a dict with lists: my_table[column name] = [... data ...] . Tablite uses datatype mapping to native numpy types where possible and uses type mapping for non-native types such as timedelta, None, date, time\u2026 e.g. what you put in, is what you get out. This is inspired by bank python. "},{"location":"#light-weight","title":"Light weight","text":"Tablite is ~200 kB. "},{"location":"#helpful","title":"Helpful","text":"Tablite wants you to be productive, so a number of helpers are available. Table.import_file to import csv*, tsv, txt, xls, xlsx, xlsm, ods, zip and logs. There is automatic type detection (see tutorial.ipynb ) - To peek into any supported file use
get_headers which shows the first 10 rows. - Use
mytable.rows and mytable.columns to iterate over rows or columns. - Create multi-key
.index for quick lookups. - Perform multi-key
.sort . - Filter using
.any and .all to select specific rows. - Use multi-key
.lookup and .join to find data across tables. - Perform
.groupby and reorganise data as a .pivot table with max, min, sum, first, last, count, unique, average, st.deviation, median and mode - Append / concatenate tables with
+= which automatically sorts out the columns - even if they're not in perfect order. - Should your tables be similar but not identical, you can use
.stack to \"stack\" tables on top of each other If you're still missing something add it to the wishlist "},{"location":"#installation","title":"Installation","text":"Get it from pypi: Install: pip install tablite Usage: >>> from tablite import Table "},{"location":"#build-test","title":"Build & test","text":"install nim >= 2.0.0 run: chmod +x ./build_nim.sh run: ./build_nim.sh Should the default nim not be your desired taste, please use nims environment manager (atlas ) and run source nim-2.0.0/activate.sh on UNIX or nim-2.0.0/activate.bat on windows. install python >= 3.8\npython -m venv /your/venv/dir\nactivate /your/venv/dir\npip install -r requirements.txt\npip install -r requirements_for_testing.py\npytest ./tests\n "},{"location":"#feature-overview","title":"Feature overview","text":"want to... this way... loop over rows [ row for row in table.rows ] loop over columns [ table[col_name] for col_name in table.columns ] slice myslice = table['A', 'B', slice(0,None,15)] get column by name my_table['A'] get row by index my_table[9_000_000_001] value update mytable['A'][2] = new value update w. list comprehension mytable['A'] = [ x*x for x in mytable['A'] if x % 2 != 0 ] join a_join = numbers.join(letters, left_keys=['colour'], right_keys=['color'], left_columns=['number'], right_columns=['letter'], kind='left') lookup travel_plan = friends.lookup(bustable, (DataTypes.time(21, 10), \"<=\", 'time'), ('stop', \"==\", 'stop')) groupby group_by = table.groupby(keys=['C', 'B'], functions=[('A', gb.count)]) pivot table my_pivot = t.pivot(rows=['C'], columns=['A'], functions=[('B', gb.sum), ('B', gb.count)], values_as_rows=False) index indices = old_table.index(*old_table.columns) sort lookup1_sorted = lookup_1.sort(**{'time': True, 'name':False, \"sort_mode\":'unix'}) filter true, false = unfiltered.filter( [{\"column1\": 'a', \"criteria\":\">=\", 'value2':3}, ... more criteria ... ], filter_type='all' ) find any any_even_rows = mytable.any('A': lambda x : x%2==0, 'B': lambda x > 0) find all all_even_rows = mytable.all('A': lambda x : x%2==0, 'B': lambda x > 0) to json json_str = my_table.to_json() from json Table.from_json(json_str) "},{"location":"#api","title":"API","text":"To view the detailed API see api "},{"location":"#tutorial","title":"Tutorial","text":"To learn more see the tutorial.ipynb (Jupyter notebook) "},{"location":"#latest-updates","title":"Latest updates","text":"See changelog.md "},{"location":"#credits","title":"Credits","text":" - Eugene Antonov - the api documentation.
- Audrius Kulikajevas - Edge case testing / various bugs, Jupyter notebook integration.
- Ovidijus Grigas - various bugs, documentation.
- Martynas Kaunas - GroupBy functionality.
- Sergej Sinkarenko - various bugs.
- Lori Cooper - spell checking.
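To round off this page, here is a minimal end-to-end sketch assembled from the one-liners in the feature overview above. The table contents are invented for illustration, and the keyword forms (criteria dicts for filter, column=callable pairs for any) follow the overview table; treat it as a sketch rather than canonical API documentation.

```python
from tablite import Table

# a table behaves like a dict with lists
t = Table({'colour': ['red', 'blue', 'red'], 'number': [1, 2, 3]})

# filter with criteria dicts; returns the matching and non-matching tables
true, false = t.filter(
    [{"column1": 'number', "criteria": ">=", "value2": 2}],
    filter_type='all',
)

# any/all take column=callable pairs (kwargs form, as used in the benchmarks below)
evens = t.any(**{'number': lambda x: x % 2 == 0})

# append with += and repeat with *=
t2 = t + t
t2 *= 3
```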
"},{"location":"benchmarks/","title":"Benchmarks","text":"In\u00a0[2]: Copied! import psutil, os, gc, shutil, tempfile\nfrom pathlib import Path\nfrom time import perf_counter, time\nfrom tablite import Table\nfrom tablite.datasets import synthetic_order_data\nfrom tablite.config import Config\n\nConfig.TQDM_DISABLE = True\n import psutil, os, gc, shutil, tempfile from pathlib import Path from time import perf_counter, time from tablite import Table from tablite.datasets import synthetic_order_data from tablite.config import Config Config.TQDM_DISABLE = True In\u00a0[3]: Copied! process = psutil.Process(os.getpid())\n\ndef make_tables(sizes=[1,2,5,10,20,50]):\n # The last tables are too big for RAM (~24Gb), so I create subtables of 1M rows and append them.\n t = synthetic_order_data(Config.PAGE_SIZE)\n real, flat = t.nbytes()\n print(f\"Table {len(t):,} rows is {real/1e6:,.0f} Mb on disk\")\n\n tables = [t] # 1M rows.\n\n last = 1\n t2 = t.copy()\n for i in sizes[1:]:\n t2 = t2.copy()\n for _ in range(i-last):\n t2 += synthetic_order_data(Config.PAGE_SIZE) # these are all unique\n last = i\n real, flat = t2.nbytes()\n tables.append(t2)\n print(f\"Table {len(t2):,} rows is {real/1e6:,.0f} Mb on disk\")\n return tables\n\ntables = make_tables()\n process = psutil.Process(os.getpid()) def make_tables(sizes=[1,2,5,10,20,50]): # The last tables are too big for RAM (~24Gb), so I create subtables of 1M rows and append them. t = synthetic_order_data(Config.PAGE_SIZE) real, flat = t.nbytes() print(f\"Table {len(t):,} rows is {real/1e6:,.0f} Mb on disk\") tables = [t] # 1M rows. last = 1 t2 = t.copy() for i in sizes[1:]: t2 = t2.copy() for _ in range(i-last): t2 += synthetic_order_data(Config.PAGE_SIZE) # these are all unique last = i real, flat = t2.nbytes() tables.append(t2) print(f\"Table {len(t2):,} rows is {real/1e6:,.0f} Mb on disk\") return tables tables = make_tables() Table 1,000,000 rows is 256 Mb on disk\nTable 2,000,000 rows is 512 Mb on disk\nTable 5,000,000 rows is 1,280 Mb on disk\nTable 10,000,000 rows is 2,560 Mb on disk\nTable 20,000,000 rows is 5,120 Mb on disk\nTable 50,000,000 rows is 12,800 Mb on disk\n The values in the tables above are all unique! In\u00a0[4]: Copied! 
tables[-1]\n tables[-1] Out[4]: ~#1234567891011 0114014953182952021-10-06T00:00:0050814119375C3-4HGQ21\u00b0XYZ1.244647268201734421.367107051830455 129320231372182021-08-26T00:00:005007718568C5-5FZU0\u00b00.55294485347516132.6980406874392537 2312569602250812021-12-21T00:00:0050197029074C2-3GTK6\u00b0XYZ1.99739754559065617.513164305723787 3414012777817432021-08-23T00:00:0050818024969C4-3BYP6\u00b0XYZ0.047497125538289577.388171617130485 459426667674262021-07-31T00:00:0050307113074C5-2CCC21\u00b0ABC1.0219215027612885.21324123446987 5612186131851272021-12-01T00:00:0050484117249C5-4WGT21\u00b00.2038764258434556712.190974436133764 676070424343982021-11-29T00:00:0050578011564C2-3LUL0\u00b0XYZ2.2367835158480444.340628097363572.......................................49,999,9939999946602693775472021-09-17T00:00:005015409706C4-3AHQ21\u00b0XYZ0.083216645843125856.56780297752790549,999,9949999955709798646952021-08-01T00:00:0050149125006C1-2FWH6\u00b01.04763923662266419.50710544462706549,999,9959999963551956078252021-07-29T00:00:0050007026992C4-3GVG21\u00b02.20440816560941411.2706443974284949,999,99699999720762240577282021-10-16T00:00:0050950113339C5-4NKS0\u00b02.1593110498135494.21575620046596149,999,9979999986577247891352021-12-21T00:00:0050069114747C2-4LYGNone1.64809640191698683.094420483625827349,999,9989999999775312438842021-12-02T00:00:0050644129345C2-5DRH6\u00b02.30911421692753110.82706867207146849,999,999100000012290713920652021-08-23T00:00:0050706119732C4-5AGB6\u00b00.488871405593691630.8580085696389939 In\u00a0[5]: Copied! def save_load_benchmarks(tables):\n tmp = Path(tempfile.gettempdir()) / \"junk\"\n tmp.mkdir(exist_ok=True)\n\n results = Table()\n results.add_columns('rows', 'save (sec)', 'load (sec)')\n for t in tables:\n fn = tmp / f'{len(t)}.tpz'\n start = perf_counter()\n t.save(fn)\n end = perf_counter()\n save = round(end-start,3)\n assert fn.exists()\n \n \n start = perf_counter()\n t2 = Table.load(fn)\n end = perf_counter()\n load = round(end-start,3)\n print(f\"saving {len(t):,} rows ({fn.stat().st_size/1e6:,.0f} Mb) took {save:,.3f} seconds. loading took {load:,.3f} seconds\")\n del t2\n fn.unlink()\n results.add_rows(len(t), save, load)\n \n r = results\n r['save r/sec'] = [int(a/b) if b!=0 else \"nil\" for a,b in zip(r['rows'], r['save (sec)']) ]\n r['load r/sec'] = [int(a/b) if b!=0 else \"nil\" for a,b in zip(r['rows'], r['load (sec)'])]\n\n return results\n def save_load_benchmarks(tables): tmp = Path(tempfile.gettempdir()) / \"junk\" tmp.mkdir(exist_ok=True) results = Table() results.add_columns('rows', 'save (sec)', 'load (sec)') for t in tables: fn = tmp / f'{len(t)}.tpz' start = perf_counter() t.save(fn) end = perf_counter() save = round(end-start,3) assert fn.exists() start = perf_counter() t2 = Table.load(fn) end = perf_counter() load = round(end-start,3) print(f\"saving {len(t):,} rows ({fn.stat().st_size/1e6:,.0f} Mb) took {save:,.3f} seconds. loading took {load:,.3f} seconds\") del t2 fn.unlink() results.add_rows(len(t), save, load) r = results r['save r/sec'] = [int(a/b) if b!=0 else \"nil\" for a,b in zip(r['rows'], r['save (sec)']) ] r['load r/sec'] = [int(a/b) if b!=0 else \"nil\" for a,b in zip(r['rows'], r['load (sec)'])] return results In\u00a0[6]: Copied! slb = save_load_benchmarks(tables)\n slb = save_load_benchmarks(tables) saving 1,000,000 rows (49 Mb) took 2.148 seconds. loading took 0.922 seconds\nsaving 2,000,000 rows (98 Mb) took 4.267 seconds. loading took 1.820 seconds\nsaving 5,000,000 rows (246 Mb) took 10.618 seconds. 
loading took 4.482 seconds\nsaving 10,000,000 rows (492 Mb) took 21.291 seconds. loading took 8.944 seconds\nsaving 20,000,000 rows (984 Mb) took 42.603 seconds. loading took 17.821 seconds\nsaving 50,000,000 rows (2,461 Mb) took 106.644 seconds. loading took 44.600 seconds\n In\u00a0[7]: Copied! slb\n slb Out[7]: #rowssave (sec)load (sec)save r/secload r/sec 010000002.1480.9224655491084598 120000004.2671.824687131098901 2500000010.6184.4824708981115573 31000000021.2918.9444696821118067 42000000042.60317.8214694501122271 550000000106.64444.64688491121076 With various compression options In\u00a0[8]: Copied! def save_compression_benchmarks(t):\n tmp = Path(tempfile.gettempdir()) / \"junk\"\n tmp.mkdir(exist_ok=True)\n\n import zipfile # https://docs.python.org/3/library/zipfile.html#zipfile.ZipFile\n methods = [(None, zipfile.ZIP_STORED, \"zip stored\"), (None, zipfile.ZIP_LZMA, \"zip lzma\")]\n methods += [(i, zipfile.ZIP_DEFLATED, \"zip deflated\") for i in range(0,10)]\n methods += [(i, zipfile.ZIP_BZIP2, \"zip bzip2\") for i in range(1,10)]\n\n results = Table()\n results.add_columns('file size (Mb)', 'method', 'write (sec)', 'read (sec)')\n for level, method, name in methods:\n fn = tmp / f'{len(t)}.tpz'\n start = perf_counter() \n t.save(fn, compression_method=method, compression_level=level)\n end = perf_counter()\n write = round(end-start,3)\n assert fn.exists()\n size = int(fn.stat().st_size/1e6)\n # print(f\"{name}(level={level}): {len(t):,} rows ({size} Mb) took {write:,.3f} secconds to save\", end='')\n \n start = perf_counter()\n t2 = Table.load(fn)\n end = perf_counter()\n read = round(end-start,3)\n # print(f\" and {end-start:,.3} seconds to load\")\n print(\".\", end='')\n \n del t2\n fn.unlink()\n results.add_rows(size, f\"{name}(level={level})\", write, read)\n \n \n r = results\n r.sort({'write (sec)':True})\n r['write (rps)'] = [int(1_000_000/b) for b in r['write (sec)']]\n r['read (rps)'] = [int(1_000_000/b) for b in r['read (sec)']]\n return results\n def save_compression_benchmarks(t): tmp = Path(tempfile.gettempdir()) / \"junk\" tmp.mkdir(exist_ok=True) import zipfile # https://docs.python.org/3/library/zipfile.html#zipfile.ZipFile methods = [(None, zipfile.ZIP_STORED, \"zip stored\"), (None, zipfile.ZIP_LZMA, \"zip lzma\")] methods += [(i, zipfile.ZIP_DEFLATED, \"zip deflated\") for i in range(0,10)] methods += [(i, zipfile.ZIP_BZIP2, \"zip bzip2\") for i in range(1,10)] results = Table() results.add_columns('file size (Mb)', 'method', 'write (sec)', 'read (sec)') for level, method, name in methods: fn = tmp / f'{len(t)}.tpz' start = perf_counter() t.save(fn, compression_method=method, compression_level=level) end = perf_counter() write = round(end-start,3) assert fn.exists() size = int(fn.stat().st_size/1e6) # print(f\"{name}(level={level}): {len(t):,} rows ({size} Mb) took {write:,.3f} secconds to save\", end='') start = perf_counter() t2 = Table.load(fn) end = perf_counter() read = round(end-start,3) # print(f\" and {end-start:,.3} seconds to load\") print(\".\", end='') del t2 fn.unlink() results.add_rows(size, f\"{name}(level={level})\", write, read) r = results r.sort({'write (sec)':True}) r['write (rps)'] = [int(1_000_000/b) for b in r['write (sec)']] r['read (rps)'] = [int(1_000_000/b) for b in r['read (sec)']] return results In\u00a0[9]: Copied! scb = save_compression_benchmarks(tables[0])\n scb = save_compression_benchmarks(tables[0]) ..................... 
creating sort index: 0%| | 0/1 [00:00<?, ?it/s]\rcreating sort index: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 1/1 [00:00<00:00, 268.92it/s]\n In\u00a0[10]: Copied! scb[0:20]\n scb[0:20] Out[10]: #file size (Mb)methodwrite (sec)read (sec)write (rps)read (rps) 0256zip stored(level=None)0.3960.47525252522105263 129zip lzma(level=None)95.1372.22810511448833 2256zip deflated(level=0)0.5350.59518691581680672 349zip deflated(level=1)2.150.9224651161084598 447zip deflated(level=2)2.2640.9124416961096491 543zip deflated(level=3)3.0490.833279761204819 644zip deflated(level=4)2.920.8623424651160092 742zip deflated(level=5)4.0340.8692478921150747 840zip deflated(level=6)8.5580.81168491250000 939zip deflated(level=7)13.6950.7787301912853471038zip deflated(level=8)56.9720.7921755212626261138zip deflated(level=9)122.6230.791815512642221229zip bzip2(level=1)15.1214.065661332460021329zip bzip2(level=2)16.0474.214623162373041429zip bzip2(level=3)16.8584.409593192268081529zip bzip2(level=4)17.6485.141566631945141629zip bzip2(level=5)18.6746.009535501664171729zip bzip2(level=6)19.4056.628515331508751829zip bzip2(level=7)19.9546.714501151489421929zip bzip2(level=8)20.5956.96148555143657 Conclusions - Fastest: zip stored with no compression handles ~2.5M rows/sec on write and ~2.1M rows/sec on read, but offers no size reduction (256 Mb). A short sketch of these save options follows.
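A minimal sketch of these save options, using the same compression_method / compression_level keywords as the benchmark code above; the file names are illustrative.

```python
import zipfile
from tablite import Table

t = Table({'A': list(range(1_000_000))})

# fastest write/read, but no size reduction (zip stored)
t.save('fast.tpz', compression_method=zipfile.ZIP_STORED, compression_level=None)

# ~5x smaller file at a modest time cost (deflate, level 1), per the table above
t.save('small.tpz', compression_method=zipfile.ZIP_DEFLATED, compression_level=1)
```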
In\u00a0[11]: Copied! def to_sql_benchmark(t, rows=1_000_000):\n t2 = t[:rows]\n write_start = time()\n _ = t2.to_sql(name='1')\n write_end = time()\n write = round(write_end-write_start,3)\n return ( t.to_sql.__name__, write, 0, len(t2), \"\" , \"\" )\n def to_sql_benchmark(t, rows=1_000_000): t2 = t[:rows] write_start = time() _ = t2.to_sql(name='1') write_end = time() write = round(write_end-write_start,3) return ( t.to_sql.__name__, write, 0, len(t2), \"\" , \"\" ) In\u00a0[12]: Copied! def to_json_benchmark(t, rows=1_000_000):\n t2 = t[:rows]\n\n tmp = Path(tempfile.gettempdir()) / \"junk\"\n tmp.mkdir(exist_ok=True)\n path = tmp / \"1.json\" \n \n write_start = time()\n bytestr = t2.to_json()\n with path.open('w') as fo:\n fo.write(bytestr)\n write_end = time()\n write = round(write_end-write_start,3)\n\n read_start = time()\n with path.open('r') as fi:\n _ = Table.from_json(fi.read()) # <-- JSON\n read_end = time()\n read = round(read_end-read_start,3)\n\n return ( t.to_json.__name__, write, read, len(t2), int(path.stat().st_size/1e6), \"\" )\n def to_json_benchmark(t, rows=1_000_000): t2 = t[:rows] tmp = Path(tempfile.gettempdir()) / \"junk\" tmp.mkdir(exist_ok=True) path = tmp / \"1.json\" write_start = time() bytestr = t2.to_json() with path.open('w') as fo: fo.write(bytestr) write_end = time() write = round(write_end-write_start,3) read_start = time() with path.open('r') as fi: _ = Table.from_json(fi.read()) # <-- JSON read_end = time() read = round(read_end-read_start,3) return ( t.to_json.__name__, write, read, len(t2), int(path.stat().st_size/1e6), \"\" ) In\u00a0[13]: Copied! def f(t, args):\n rows, c1, c1_kw, c2, c2_kw = args\n t2 = t[:rows]\n\n call = getattr(t2, c1)\n assert callable(call)\n\n write_start = time()\n call(**c1_kw)\n write_end = time()\n write = round(write_end-write_start,3)\n\n for _ in range(10):\n gc.collect()\n\n read_start = time()\n if callable(c2):\n c2(**c2_kw)\n read_end = time()\n read = round(read_end-read_start,3)\n\n fn = c2_kw['path']\n assert fn.exists()\n fs = int(fn.stat().st_size/1e6)\n config = {k:v for k,v in c2_kw.items() if k!= 'path'}\n\n return ( c1, write, read, len(t2), fs , str(config))\n def f(t, args): rows, c1, c1_kw, c2, c2_kw = args t2 = t[:rows] call = getattr(t2, c1) assert callable(call) write_start = time() call(**c1_kw) write_end = time() write = round(write_end-write_start,3) for _ in range(10): gc.collect() read_start = time() if callable(c2): c2(**c2_kw) read_end = time() read = round(read_end-read_start,3) fn = c2_kw['path'] assert fn.exists() fs = int(fn.stat().st_size/1e6) config = {k:v for k,v in c2_kw.items() if k!= 'path'} return ( c1, write, read, len(t2), fs , str(config)) In\u00a0[14]: Copied! 
def import_export_benchmarks(tables):\n Config.PROCESSING_MODE = Config.FALSE\n \n t = sorted(tables, key=lambda x: len(x), reverse=True)[0]\n \n tmp = Path(tempfile.gettempdir()) / \"junk\"\n tmp.mkdir(exist_ok=True) \n\n args = [\n ( 100_000, \"to_xlsx\", {'path': tmp/'1.xlsx'}, Table.from_file, {\"path\":tmp/'1.xlsx', \"sheet\":\"pyexcel_sheet1\"}),\n ( 50_000, \"to_ods\", {'path': tmp/'1.ods'}, Table.from_file, {\"path\":tmp/'1.ods', \"sheet\":\"pyexcel_sheet1\"} ), # 50k rows, otherwise MemoryError.\n ( 1_000_000, \"to_csv\", {'path': tmp/'1.csv'}, Table.from_file, {\"path\":tmp/'1.csv'} ),\n ( 1_000_000, \"to_csv\", {'path': tmp/'1.csv'}, Table.from_file, {\"path\":tmp/'1.csv', \"guess_datatypes\":False}),\n (10_000_000, \"to_csv\", {'path': tmp/'1.csv'}, Table.from_file, {\"path\":tmp/'1.csv', \"guess_datatypes\":False}),\n ( 1_000_000, \"to_tsv\", {'path': tmp/'1.tsv'}, Table.from_file, {\"path\":tmp/'1.tsv'} ),\n ( 1_000_000, \"to_text\", {'path': tmp/'1.txt'}, Table.from_file, {\"path\":tmp/'1.txt'} ),\n ( 1_000_000, \"to_html\", {'path': tmp/'1.html'}, Table.from_file, {\"path\":tmp/'1.html'} ),\n ( 1_000_000, \"to_hdf5\", {'path': tmp/'1.hdf5'}, Table.from_file, {\"path\":tmp/'1.hdf5'} )\n ]\n\n results = Table()\n results.add_columns('method', 'write (s)', 'read (s)', 'rows', 'size (Mb)', 'config')\n\n results.add_rows( to_sql_benchmark(t) )\n results.add_rows( to_json_benchmark(t) )\n\n for arg in args:\n if len(t)<arg[0]:\n continue\n print(\".\", end='')\n try:\n results.add_rows( f(t, arg) )\n except MemoryError:\n results.add_rows( arg[1], \"Memory Error\", \"NIL\", args[0], \"NIL\", \"N/A\")\n \n r = results\n r['read r/sec'] = [int(a/b) if b!=0 else \"nil\" for a,b in zip(r['rows'], r['read (s)']) ]\n r['write r/sec'] = [int(a/b) if b!=0 else \"nil\" for a,b in zip(r['rows'], r['write (s)'])]\n\n shutil.rmtree(tmp)\n return results\n def import_export_benchmarks(tables): Config.PROCESSING_MODE = Config.FALSE t = sorted(tables, key=lambda x: len(x), reverse=True)[0] tmp = Path(tempfile.gettempdir()) / \"junk\" tmp.mkdir(exist_ok=True) args = [ ( 100_000, \"to_xlsx\", {'path': tmp/'1.xlsx'}, Table.from_file, {\"path\":tmp/'1.xlsx', \"sheet\":\"pyexcel_sheet1\"}), ( 50_000, \"to_ods\", {'path': tmp/'1.ods'}, Table.from_file, {\"path\":tmp/'1.ods', \"sheet\":\"pyexcel_sheet1\"} ), # 50k rows, otherwise MemoryError. ( 1_000_000, \"to_csv\", {'path': tmp/'1.csv'}, Table.from_file, {\"path\":tmp/'1.csv'} ), ( 1_000_000, \"to_csv\", {'path': tmp/'1.csv'}, Table.from_file, {\"path\":tmp/'1.csv', \"guess_datatypes\":False}), (10_000_000, \"to_csv\", {'path': tmp/'1.csv'}, Table.from_file, {\"path\":tmp/'1.csv', \"guess_datatypes\":False}), ( 1_000_000, \"to_tsv\", {'path': tmp/'1.tsv'}, Table.from_file, {\"path\":tmp/'1.tsv'} ), ( 1_000_000, \"to_text\", {'path': tmp/'1.txt'}, Table.from_file, {\"path\":tmp/'1.txt'} ), ( 1_000_000, \"to_html\", {'path': tmp/'1.html'}, Table.from_file, {\"path\":tmp/'1.html'} ), ( 1_000_000, \"to_hdf5\", {'path': tmp/'1.hdf5'}, Table.from_file, {\"path\":tmp/'1.hdf5'} ) ] results = Table() results.add_columns('method', 'write (s)', 'read (s)', 'rows', 'size (Mb)', 'config') results.add_rows( to_sql_benchmark(t) ) results.add_rows( to_json_benchmark(t) ) for arg in args: if len(t) In\u00a0[15]: Copied! ieb = import_export_benchmarks(tables)\n ieb = import_export_benchmarks(tables) .........writing 12,000,000 records to /tmp/junk/1.hdf5... done\n In\u00a0[16]: Copied! 
ieb\n ieb Out[16]: #methodwrite (s)read (s)rowssize (Mb)configread r/secwrite r/sec 0to_sql12.34501000000nil81004 1to_json10.8144.406100000014222696392472 2to_xlsx10.56921.5721000009{'sheet': 'pyexcel_sheet1'}46359461 3to_ods29.17529.487500003{'sheet': 'pyexcel_sheet1'}16951713 4to_csv14.31515.7311000000108{}6356869856 5to_csv14.4388.1691000000108{'guess_datatypes': False}12241469261 6to_csv140.64599.45100000001080{'guess_datatypes': False}10055371100 7to_tsv13.83415.7631000000108{}6343972285 8to_text13.93715.6821000000108{}6376771751 9to_html12.5780.531000000228{}18867927950310to_hdf55.0112.3451000000316{}81004199600 Conclusions Best: - to/from JSON wins with ~0.23M rps read
- to/from CSV/TSV/TEXT comes 2nd with config
guess_datatypes=False at ~122k rps Worst: - to/from ods ran out of memory and hence had to be reduced to 50k rows. It also had the slowest read rate, at ~1,700 rps. A short sketch of the fast path follows.
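For the fast path, a minimal sketch using the same keywords as the benchmark args above; the path is illustrative.

```python
from pathlib import Path
from tablite import Table

path = Path('/tmp/example.csv')  # illustrative location
t = Table({'A': list(range(1_000_000))})
t.to_csv(path=path)

# skipping type guessing roughly doubles csv read throughput (see the table above)
fast = Table.from_file(path, guess_datatypes=False)
slow = Table.from_file(path)     # with datatype guessing
```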
In\u00a0[17]: Copied! def contains_benchmark(table):\n results = Table()\n results.add_columns( \"column\", \"time (s)\" )\n for name,col in table.columns.items():\n n = len(col)\n start,stop,step = int(n*0.02), int(n*0.98), int(n/100)\n selection = col[start:stop:step]\n total_time = 0.0\n for v in selection:\n start_time = perf_counter()\n v in col # <--- test!\n end_time = perf_counter()\n total_time += (end_time - start_time)\n avg_time = total_time / len(selection)\n results.add_rows( name, round(avg_time,3) )\n\n return results\n def contains_benchmark(table): results = Table() results.add_columns( \"column\", \"time (s)\" ) for name,col in table.columns.items(): n = len(col) start,stop,step = int(n*0.02), int(n*0.98), int(n/100) selection = col[start:stop:step] total_time = 0.0 for v in selection: start_time = perf_counter() v in col # <--- test! end_time = perf_counter() total_time += (end_time - start_time) avg_time = total_time / len(selection) results.add_rows( name, round(avg_time,3) ) return results In\u00a0[18]: Copied! has_it = contains_benchmark(tables[-1])\nhas_it\n has_it = contains_benchmark(tables[-1]) has_it Out[18]: #columntime (s) 0#0.001 110.043 220.032 330.001 440.001 550.001 660.006 770.003 880.006 990.00710100.04311110.655 In\u00a0[19]: Copied! def slicing_benchmark(table):\n n = len(table)\n start,stop,step = int(0.02*n), int(0.98*n), int(n / 20) # from 2% to 98% in 20 large steps\n start_time = perf_counter()\n snip = table[start:stop:step]\n end_time = perf_counter()\n print(f\"reading {len(table):,} rows to find {len(snip):,} rows took {end_time-start_time:.3f} sec\")\n return snip\n def slicing_benchmark(table): n = len(table) start,stop,step = int(0.02*n), int(0.98*n), int(n / 20) # from 2% to 98% in 20 large steps start_time = perf_counter() snip = table[start:stop:step] end_time = perf_counter() print(f\"reading {len(table):,} rows to find {len(snip):,} rows took {end_time-start_time:.3f} sec\") return snip In\u00a0[20]: Copied! slice_it = slicing_benchmark(tables[-1])\n slice_it = slicing_benchmark(tables[-1]) reading 50,000,000 rows to find 20 rows took 1.435 sec\n In\u00a0[22]: Copied! def column_selection_benchmark(tables):\n results = Table()\n results.add_columns( 'rows')\n results.add_columns(*[f\"n cols={i}\" for i,_ in enumerate(tables[0].columns,start=1)])\n\n for table in tables:\n rr = [len(table)]\n for ix, name in enumerate(table.columns):\n cols = list(table.columns)[:ix+1]\n start_time = perf_counter()\n table[cols]\n end_time = perf_counter()\n rr.append(f\"{end_time-start_time:.5f}\")\n results.add_rows( rr )\n return results\n def column_selection_benchmark(tables): results = Table() results.add_columns( 'rows') results.add_columns(*[f\"n cols={i}\" for i,_ in enumerate(tables[0].columns,start=1)]) for table in tables: rr = [len(table)] for ix, name in enumerate(table.columns): cols = list(table.columns)[:ix+1] start_time = perf_counter() table[cols] end_time = perf_counter() rr.append(f\"{end_time-start_time:.5f}\") results.add_rows( rr ) return results In\u00a0[23]: Copied! 
csb = column_selection_benchmark(tables)\nprint(\"times below are are in seconds\")\ncsb\n csb = column_selection_benchmark(tables) print(\"times below are are in seconds\") csb times below are are in seconds\n Out[23]: #rowsn cols=1n cols=2n cols=3n cols=4n cols=5n cols=6n cols=7n cols=8n cols=9n cols=10n cols=11n cols=12 010000000.000010.000060.000040.000040.000040.000040.000040.000040.000040.000040.000040.00004 120000000.000010.000080.000030.000030.000030.000030.000030.000030.000030.000030.000040.00004 250000000.000010.000050.000040.000040.000040.000040.000040.000040.000040.000040.000040.00004 3100000000.000020.000050.000040.000040.000040.000040.000070.000050.000050.000050.000050.00005 4200000000.000030.000060.000050.000050.000050.000050.000060.000060.000060.000060.000060.00006 5500000000.000090.000110.000100.000090.000090.000090.000090.000090.000090.000090.000100.00009 In\u00a0[33]: Copied! def iterrows_benchmark(table):\n results = Table()\n results.add_columns( 'n columns', 'time (s)')\n\n columns = ['1']\n for column in list(table.columns):\n columns.append(column)\n snip = table[columns, slice(500_000,1_500_000)]\n start_time = perf_counter()\n counts = 0\n for row in snip.rows:\n counts += 1\n end_time = perf_counter()\n results.add_rows( len(columns), round(end_time-start_time,3))\n\n return results\n def iterrows_benchmark(table): results = Table() results.add_columns( 'n columns', 'time (s)') columns = ['1'] for column in list(table.columns): columns.append(column) snip = table[columns, slice(500_000,1_500_000)] start_time = perf_counter() counts = 0 for row in snip.rows: counts += 1 end_time = perf_counter() results.add_rows( len(columns), round(end_time-start_time,3)) return results In\u00a0[34]: Copied! iterb = iterrows_benchmark(tables[-1])\niterb\n iterb = iterrows_benchmark(tables[-1]) iterb Out[34]: #n columnstime (s) 029.951 139.816 249.859 359.93 469.985 579.942 689.958 799.867 8109.96 9119.93210129.8311139.861 In\u00a0[35]: Copied! import matplotlib.pyplot as plt\nplt.plot(iterb['n columns'], iterb['time (s)'])\nplt.show()\n import matplotlib.pyplot as plt plt.plot(iterb['n columns'], iterb['time (s)']) plt.show() In\u00a0[28]: Copied! tables[-1].types()\n tables[-1].types() Out[28]: {'#': {int: 50000000},\n '1': {int: 50000000},\n '2': {str: 50000000},\n '3': {int: 50000000},\n '4': {int: 50000000},\n '5': {int: 50000000},\n '6': {str: 50000000},\n '7': {str: 50000000},\n '8': {str: 50000000},\n '9': {str: 50000000},\n '10': {float: 50000000},\n '11': {str: 50000000}} In\u00a0[29]: Copied! def dtypes_benchmark(tables):\n dtypes_results = Table()\n dtypes_results.add_columns(\"rows\", \"time (s)\")\n\n for table in tables:\n start_time = perf_counter()\n dt = table.types()\n end_time = perf_counter()\n assert isinstance(dt, dict) and len(dt) != 0\n dtypes_results.add_rows( len(table), round(end_time-start_time, 3) )\n\n return dtypes_results\n def dtypes_benchmark(tables): dtypes_results = Table() dtypes_results.add_columns(\"rows\", \"time (s)\") for table in tables: start_time = perf_counter() dt = table.types() end_time = perf_counter() assert isinstance(dt, dict) and len(dt) != 0 dtypes_results.add_rows( len(table), round(end_time-start_time, 3) ) return dtypes_results In\u00a0[30]: Copied! dtype_b = dtypes_benchmark(tables)\ndtype_b\n dtype_b = dtypes_benchmark(tables) dtype_b Out[30]: #rowstime (s) 010000000.0 120000000.0 250000000.0 3100000000.0 4200000000.0 5500000000.001 In\u00a0[31]: Copied! 
def any_benchmark(tables):\n results = Table()\n results.add_columns(\"rows\", *list(tables[0].columns))\n\n for table in tables:\n tmp = [len(table)]\n for column in list(table.columns):\n v = table[column][0]\n start_time = perf_counter()\n _ = table.any(**{column: v})\n end_time = perf_counter() \n tmp.append(round(end_time-start_time,3))\n\n results.add_rows( tmp )\n return results\n def any_benchmark(tables): results = Table() results.add_columns(\"rows\", *list(tables[0].columns)) for table in tables: tmp = [len(table)] for column in list(table.columns): v = table[column][0] start_time = perf_counter() _ = table.any(**{column: v}) end_time = perf_counter() tmp.append(round(end_time-start_time,3)) results.add_rows( tmp ) return results In\u00a0[32]: Copied! anyb = any_benchmark(tables)\nanyb\n anyb = any_benchmark(tables) anyb Out[32]: ~rows#1234567891011 010000000.1330.1330.1780.1330.2920.1470.1690.1430.2270.2590.1460.17 120000000.2680.2630.3430.2650.5670.2940.3350.2750.4640.5230.2890.323 250000000.6690.6530.9140.6691.4360.7230.8380.6941.1741.3350.6780.818 3100000001.3141.351.7451.3362.9021.491.6831.4142.3542.6181.3431.536 4200000002.5562.5343.3372.6025.6452.8273.2252.6464.5145.082.6933.083 5500000006.5716.4238.4556.69914.4847.9897.7986.25910.98912.486.7327.767 In\u00a0[36]: Copied! def all_benchmark(tables):\n results = Table()\n results.add_columns(\"rows\", *list(tables[0].columns))\n\n for table in tables:\n tmp = [len(table)]\n for column in list(table.columns):\n v = table[column][0]\n start_time = perf_counter()\n _ = table.all(**{column: v})\n end_time = perf_counter() \n tmp.append(round(end_time-start_time,3))\n\n results.add_rows( tmp )\n return results\n def all_benchmark(tables): results = Table() results.add_columns(\"rows\", *list(tables[0].columns)) for table in tables: tmp = [len(table)] for column in list(table.columns): v = table[column][0] start_time = perf_counter() _ = table.all(**{column: v}) end_time = perf_counter() tmp.append(round(end_time-start_time,3)) results.add_rows( tmp ) return results In\u00a0[37]: Copied! allb = all_benchmark(tables)\nallb\n allb = all_benchmark(tables) allb Out[37]: ~rows#1234567891011 010000000.120.1210.1620.1220.2640.1380.1550.1270.2090.2370.1330.151 120000000.2370.2350.3110.2380.520.2660.2970.3410.4510.530.2610.285 250000000.6750.6980.9520.5941.6050.6590.8120.7191.2241.3530.6640.914 3100000001.3141.3321.7071.3323.0911.4631.7811.3662.3582.6381.4091.714 4200000002.5762.3133.112.3965.2072.5732.9212.4034.0414.6582.4632.808 5500000005.8965.827.735.95612.9097.457.275.98110.18311.5766.3727.414 In\u00a0[\u00a0]: Copied! \n In\u00a0[38]: Copied! def unique_benchmark(tables):\n results = Table()\n results.add_columns(\"rows\", *list(tables[0].columns))\n \n for table in tables:\n length = len(table)\n\n tmp = [len(table)]\n for column in list(table.columns):\n start_time = perf_counter()\n try:\n L = table[column].unique()\n dt = perf_counter() - start_time\n except MemoryError:\n dt = -1\n tmp.append(round(dt,3))\n assert 0 < len(L) <= length \n\n results.add_rows( tmp )\n return results\n def unique_benchmark(tables): results = Table() results.add_columns(\"rows\", *list(tables[0].columns)) for table in tables: length = len(table) tmp = [len(table)] for column in list(table.columns): start_time = perf_counter() try: L = table[column].unique() dt = perf_counter() - start_time except MemoryError: dt = -1 tmp.append(round(dt,3)) assert 0 < len(L) <= length results.add_rows( tmp ) return results In\u00a0[39]: Copied! 
ubm = unique_benchmark(tables)\nubm\n ubm = unique_benchmark(tables) ubm Out[39]: ~rows#1234567891011 010000000.0220.0810.2480.0440.0160.0610.1150.1360.0960.0850.0940.447 120000000.1760.2710.5050.0870.0310.1240.2290.2790.1980.170.3051.471 250000000.1980.4991.2630.2180.0760.3110.570.6850.4740.4250.5952.744 3100000000.5021.1232.5350.4330.1550.6151.1281.3750.960.851.3165.826 4200000000.9562.3365.0350.8830.3191.2292.2682.7481.9131.7462.73311.883 5500000002.3956.01912.4992.1780.7643.0735.6086.8194.8284.2797.09730.511 In\u00a0[40]: Copied! def index_benchmark(tables):\n results = Table()\n results.add_columns(\"rows\", *list(tables[0].columns))\n \n for table in tables:\n\n tmp = [len(table)]\n for column in list(table.columns):\n start_time = perf_counter()\n try:\n _ = table.index(column)\n dt = perf_counter() - start_time\n except MemoryError:\n dt = -1\n tmp.append(round(dt,3))\n \n results.add_rows( tmp )\n return results\n def index_benchmark(tables): results = Table() results.add_columns(\"rows\", *list(tables[0].columns)) for table in tables: tmp = [len(table)] for column in list(table.columns): start_time = perf_counter() try: _ = table.index(column) dt = perf_counter() - start_time except MemoryError: dt = -1 tmp.append(round(dt,3)) results.add_rows( tmp ) return results In\u00a0[41]: Copied! ibm = index_benchmark(tables)\nibm\n ibm = index_benchmark(tables) ibm Out[41]: ~rows#1234567891011 010000001.9491.7931.4321.1061.0511.231.3381.4931.4111.3031.9992.325 120000002.8833.5172.8562.2172.1242.4622.6762.9862.7092.6064.0494.461 250000006.3829.0497.0965.6285.3536.3126.6497.5216.716.45910.2710.747 31000000012.55318.50613.9511.33510.72412.50913.3315.05113.50212.89919.76921.999 42000000024.71737.89628.56822.66621.47226.32727.15730.06427.33225.82238.31143.399 55000000063.01697.07772.00755.60954.09961.79768.23675.0769.02266.15299.183109.969 Multi-column index next: In\u00a0[42]: Copied! def multi_column_index_benchmark(tables):\n \n selection = [\"4\", \"7\", \"8\", \"9\"]\n results = Table()\n results.add_columns(\"rows\", *range(1,len(selection)+1))\n \n for table in tables:\n\n tmp = [len(table)]\n for index in range(1,5):\n start_time = perf_counter()\n try:\n _ = table.index(*selection[:index])\n dt = perf_counter() - start_time\n except MemoryError:\n dt = -1\n tmp.append(round(dt,3))\n print('.', end='')\n \n results.add_rows( tmp )\n return results\n def multi_column_index_benchmark(tables): selection = [\"4\", \"7\", \"8\", \"9\"] results = Table() results.add_columns(\"rows\", *range(1,len(selection)+1)) for table in tables: tmp = [len(table)] for index in range(1,5): start_time = perf_counter() try: _ = table.index(*selection[:index]) dt = perf_counter() - start_time except MemoryError: dt = -1 tmp.append(round(dt,3)) print('.', end='') results.add_rows( tmp ) return results In\u00a0[43]: Copied! mcib = multi_column_index_benchmark(tables)\nmcib\n mcib = multi_column_index_benchmark(tables) mcib ........................ Out[43]: #rows1234 010000001.0582.1333.2154.052 120000002.124.2786.5468.328 250000005.30310.8916.69320.793 31000000010.58122.40733.46241.91 42000000021.06445.95467.78184.828 55000000052.347109.551166.6211.053 In\u00a0[44]: Copied! 
def drop_duplicates_benchmark(tables):\n results = Table()\n results.add_columns(\"rows\", *list(tables[0].columns))\n \n for table in tables:\n result = [len(table)]\n cols = []\n for name in list(table.columns):\n cols.append(name)\n start_time = perf_counter()\n try:\n _ = table.drop_duplicates(*cols)\n dt = perf_counter() - start_time\n except MemoryError:\n dt = -1\n result.append(round(dt,3))\n print('.', end='')\n \n results.add_rows( result )\n return results\n def drop_duplicates_benchmark(tables): results = Table() results.add_columns(\"rows\", *list(tables[0].columns)) for table in tables: result = [len(table)] cols = [] for name in list(table.columns): cols.append(name) start_time = perf_counter() try: _ = table.drop_duplicates(*cols) dt = perf_counter() - start_time except MemoryError: dt = -1 result.append(round(dt,3)) print('.', end='') results.add_rows( result ) return results In\u00a0[45]: Copied! ddb = drop_duplicates_benchmark(tables)\nddb\n ddb = drop_duplicates_benchmark(tables) ddb ........................................................................ Out[45]: ~rows#1234567891011 010000001.7612.3583.3133.9014.6154.9615.8356.5347.4548.1088.8039.682 120000003.0114.936.9347.979.26410.26812.00613.51714.9216.63117.93219.493 250000006.82713.85318.63721.23724.54827.1131.15735.02638.99243.53146.02250.433 31000000013.23831.74641.14146.91753.17258.24167.99274.65182.7491.45897.666104.82 42000000025.93277.75100.34109.314123.514131.874148.432163.57179.121196.047208.686228.059 55000000064.237312.222364.886388.249429.724466.685494.418535.367581.666607.306634.343683.858"},{"location":"benchmarks/#benchmarks","title":"Benchmarks\u00b6","text":"These benchmarks seek to establish the performance of tablite as a user sees it. Overview Input/Output Various column functions Base functions Core functions - Save / Load .tpz format- Save tables to various formats- Import data from various formats - Setitem / getitem- iter- equal, not equal- copy- t += t- t *= t- contains- remove all- replace- index- unique- histogram- statistics- count - Setitem / getitem- iter / rows- equal, not equal- load- save- copy- stack- types- display_dict- show- to_dict- as_json_serializable- index - expression- filter- sort_index- reindex- drop_duplicates- sort- is_sorted- any- all- drop - replace- groupby- pivot- joins- lookup- replace missing values- transpose- pivot_transpose- diff"},{"location":"benchmarks/#input-output","title":"Input / Output\u00b6","text":""},{"location":"benchmarks/#create-tables-from-synthetic-data","title":"Create tables from synthetic data.\u00b6","text":""},{"location":"benchmarks/#save-load-tpz-format","title":"Save / Load .tpz format\u00b6","text":"Without default compression settings (10% slower than uncompressed, 20% of uncompressed filesize) "},{"location":"benchmarks/#save-load-tables-to-from-various-formats","title":"Save / load tables to / from various formats\u00b6","text":"The handlers for saving / export are: - to_sql
- to_json
- to_xlsx
- to_ods
- to_csv
- to_tsv
- to_text
- to_html
- to_hdf5
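A minimal sketch of the handlers, plus the extension-driven table.export from the changelog below; the file names are illustrative.

```python
from tablite import Table

t = Table({'A': [1, 2, 3], 'B': ['a', 'b', 'c']})

t.to_csv(path='example.csv')  # dedicated handler, as listed above
s = t.to_json()               # to_json returns a JSON string

# or let the file extension pick the format (per the changelog entry for 2022.8.0)
t.export('example.xlsx')
```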
"},{"location":"benchmarks/#various-column-functions","title":"Various column functions\u00b6","text":" - Setitem / getitem
- iter
- equal, not equal
- copy
- t += t
- t *= t
- contains
- remove all
- replace
- index
- unique
- histogram
- statistics
- count
"},{"location":"benchmarks/#various-table-functions","title":"Various table functions\u00b6","text":""},{"location":"benchmarks/#slicing","title":"Slicing\u00b6","text":"Slicing operations are used in many places. "},{"location":"benchmarks/#tabletypes","title":"Table.types()\u00b6","text":"Table.types() is implemented for near constant speed lookup. Here is an example: "},{"location":"benchmarks/#tableany","title":"Table.any\u00b6","text":""},{"location":"benchmarks/#tableall","title":"Table.all\u00b6","text":""},{"location":"benchmarks/#tablefilter","title":"Table.filter\u00b6","text":""},{"location":"benchmarks/#tableunique","title":"Table.unique\u00b6","text":""},{"location":"benchmarks/#tableindex","title":"Table.index\u00b6","text":"Single column index first: "},{"location":"benchmarks/#drop-duplicates","title":"drop duplicates\u00b6","text":""},{"location":"changelog/","title":"Changelog","text":"Version Change 2023.9.0 Adding Table.match operation. 2023.8.0 Nim backend for csv importer.Improve excel importer.Improve slicing consistency.Logical cores re-enabled on *nix based systems.Filter is now type safe.Added merge utility.Various bugfixes. 2023.6.5 Fix issues with get_headers falling back to text reading when reading 0 lines of excel, fix issue where reading excel file would ignore file count, excel file reader now has parity for linecount selection. 2023.6.4 Fix a logic bug in get_headers that caused one extra line to be returned than requested. 2023.6.3 Updated the way reference counting works. Tablite now tracks references to used pages and cleans them up based on number of references to those pages in the current process. This change allows to handle deep table clones when sending tables via processes (pickling/unpickling), whereas previous implementation would corrupt all tables using same pages due to reference counting asserting that all tables are shallow copies to the same object. 2023.6.2 Updated mplite dependency, changed to soft version requirement to prevent pipeline freezes due to small bugfixes in mplite . 2023.6.1 Major change of the backend processes. Speed up of ~6x. For more see the release notes 2022.11.19 Fixed some memory leaks. 2022.11.18 copy , filter , sort , any , all methods now properly respects the table subclass.Filter for tables with under SINGLE_PROCESSING_LIMIT rows will run on same process to reduce overhead.Errors within child processes now properly propagate to parent.Table.reset_storage(include_imports=True) now allows the user to reset the storage but exclude any imported files by setting include_imports=False during Table.reset(...) .Bug: A column with 1,None,2 would be written to csv & tsv as \"1,None,2\" . Now it is written \"1,,2\" where None means absent.Fix mp join producing mismatched columns lengths when different table lengths are used as an input or when join product is longer than the input table. 2022.11.17 Table.load now properly subclassess the table instead of always resulting in tablite.Table .Table.from_* methods now respect subclassess, fixed some from_* methods which were instance methods and not class methods.Fixed Table.from_dict only accepting list and tuple but not tablite.Column which is an equally valid type.Fix lookup parity in single process and multiple process outputs.Fix an issue with multiprocess lookup where no matches would throw instead of producing None .Fix an issue with filtering an empty table. 2022.11.16 Changed join to process 1M rows per task to avoid potential OOM on lower memory systems. 
Added mp_merge_columns to MemoryManager that merges column pages into a single column.Fix join parity in single process and multiple process outputs.Fix an issue with multiprocess join where no matches would throw instead of producing None . 2022.11.15 Bump mplite to avoid deadlock issues OS kill the process. 2022.11.14 Improve locking mechanism to allow retries when opening file as the previous solution could cause deadlocks when running multiple threads. 2022.11.13 Fix an issue with copying empty pages. 2022.11.12 Tablite now is now able to create it's own temporary directory. 2022.11.11 text_reader tqdm tracks the entire process now. text_reader properly respects free memory in *nix based systems. text_reader no longer discriminates against hyperthreaded cores. 2022.11.10 get_headers now uses plain openpyxl instead of pyexcel wrapper to speed up fetch times ~10x on certain files. 2022.11.9 get_headers can fail safe on unrecognized characters. 2022.11.8 Fix a bug with task size calculation on single core systems. 2022.11.7 Added TABLITE_TMPDIR environment variable for setting tablite work directory. Characters that fail to be read text reader due to improper encoding will be skipped. Fixed an issue where single column text files with no column delimiters would be imported as empty tables. 2022.11.6 Date inference fix 2022.11.5 Fixed negative slicing issues 2022.11.4 Transpose API changes: table.transpose(...) was renamed to table.pivot_transpose(...) new table.transpose() and table.T were added, it's functionality acts similarly to numpy.T , the column headers are used the first row in the table when transposing. 2022.11.3 Bugfix for non-ascii encoded strings during t.add_rows(...) 2022.11.2 As utf-8 is ascii compatible, the file reader utils selects utf-8 instead of ascii as a default. 2022.11.1 bugfix in datatypes.infer() where 1 was inferred as int, not float. 2022.11.0 New table features: Table.diff(other, columns=...) , table.remove_duplicates_rows() , table.drop_na(*arg) ,table.replace(target,replacement) , table.imputation(sources, targets, methods=...) , table.to_pandas() and Table.from_pandas(pd.DataFrame) ,table.to_dict(columns, slice) , Table.from_dict() ,table.transpose(columns, keep, ...) , New column features: Column.count(item) , Column[:] is guaranteed to return a python list.Column.to_numpy(slice) returns np.ndarray . new tools library: from tablite import tools with: date_range(start,end) , xround(value, multiple, up=None) , and, guess as short-cut for Datatypes.guess(...) . bugfixes: __eq__ was updated but missed __ne__ .in operator in filter would crash if datatypes were not strings. 2022.10.11 filter now accepts any expression (str) that can be compiled by pythons compiler 2022.10.11 Bugfix for .any and .all . The code now executes much faster 2022.10.10 Bugfix for Table.import_file : import_as has been removed from keywords. 2022.10.10 All Table functions now have tqdm progressbar. 2022.10.10 More robust calculation for task size for multiprocessing. 2022.10.10 Dependency update: mplite==1.2.0 is now required. 2022.10.9 Bugfix for Table.import_file : files with duplicate header names would only have last duplicate name imported.Now the headers are made unique using name_x where x is a number. 2022.10.8 Bugfix for groupby: Where keys are empty error should have been raised.Where there are no functions, unique keypairs are returned. 
2022.10.7 Bugfix for Column.statistics() for an empty column 2022.10.6 Bugfix for __setitem__ : tbl['a'] = [] is now seen as tbl.add_column('a') Bugfix for __getitem__ : calling a missing key raises keyerror. 2022.10.5 Bugfix for summary statistics. 2022.10.4 Bugfix for join shortcut. 2022.10.3 Bugfix for DataTypes where bool was evaluated wrongly 2022.10.0 Added ability to reindex in table.reindex(index=[0,1...,n,n-1]) 2022.9.0 Added ability to store python objects (example).Added warning when user iterates over non-rectangular dataset. 2022.8.0 Added table.export(path) which exports tablite Tables to file format given by the file extension. For example my_table.export('example.xlsx') .supported formats are: json , html , xlsx , xls , csv , tsv , txt , ods and sql . 2022.7.8 Added ability to forward tqdm progressbar into Table.import_file(..., tqdm=your_tqdm) , so that Jupyter notebook can use it in display -methods. 2022.7.7 Added method Table.to_sql() for export to ANSI-92 SQL enginesBugfix on to_json for timedelta . Jupyter notebook provides nice view using Table._repr_html_() JS-users can use .as_json_serializable where suitable. 2022.7.6 get_headers now takes argument (path, linecount=10) 2022.7.5 added helper Table.as_json_serializable as Jupyterkernel compat. 2022.7.4 adder helper Table.to_dict , and updated Table.to_json 2022.7.3 table.to_json now takes kwargs: row_count , columns , slice_ , start_on 2022.7.2 documentation update. 2022.7.1 minor bugfix. 2022.7.0 BREAKING CHANGES- Tablite now uses HDF5 as backend. - Has multiprocessing enabled by default. - Is 20x faster. - Completely new API. 2022.6.0 DataTypes.guess([list of strings]) returns the best matching python datatype."},{"location":"tutorial/","title":"Tutorial","text":"In\u00a0[1]: Copied! from tablite import Table\n\n## To create a tablite table is as simple as populating a dictionary:\nt = Table({'A':[1,2,3], 'B':['a','b','c']})\n from tablite import Table ## To create a tablite table is as simple as populating a dictionary: t = Table({'A':[1,2,3], 'B':['a','b','c']}) In\u00a0[2]: Copied! ## In this notebook we can show tables in the HTML style:\nt\n ## In this notebook we can show tables in the HTML style: t Out[2]: #AB 01a 12b 23c In\u00a0[3]: Copied! ## or the ascii style:\nt.show()\n ## or the ascii style: t.show() +==+=+=+\n|# |A|B|\n+--+-+-+\n| 0|1|a|\n| 1|2|b|\n| 2|3|c|\n+==+=+=+\n In\u00a0[4]: Copied! ## or if you'd like to inspect the table, use:\nprint(str(t))\n ## or if you'd like to inspect the table, use: print(str(t)) Table(2 columns, 3 rows)\n In\u00a0[5]: Copied! ## You can also add all columns at once (slower) if you prefer. \nt2 = Table(headers=('A','B'), rows=((1,'a'),(2,'b'),(3,'c')))\nassert t==t2\n ## You can also add all columns at once (slower) if you prefer. t2 = Table(headers=('A','B'), rows=((1,'a'),(2,'b'),(3,'c'))) assert t==t2 In\u00a0[6]: Copied! ## or load data:\nt3 = Table.from_file('tests/data/book1.csv')\n\n## to view any table in the notebook just let jupyter show the table. If you're using the terminal use .show(). \n## Note that show gives either first and last 7 rows or the whole table if it is less than 20 rows.\nt3\n ## or load data: t3 = Table.from_file('tests/data/book1.csv') ## to view any table in the notebook just let jupyter show the table. If you're using the terminal use .show(). ## Note that show gives either first and last 7 rows or the whole table if it is less than 20 rows. 
t3 Collecting tasks: 'tests/data/book1.csv'\nDumping tasks: 'tests/data/book1.csv'\n importing file: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 1/1 [00:00<00:00, 487.82it/s]\n Out[6]: #abcdef 010.0606060610.0909090910.1212121210.1515151520.181818182 120.1212121210.2424242420.4848484850.969696971.939393939 230.2424242420.4848484850.969696971.9393939393.878787879 340.4848484850.969696971.9393939393.8787878797.757575758 450.969696971.9393939393.8787878797.75757575815.51515152 561.9393939393.8787878797.75757575815.5151515231.03030303 673.8787878797.75757575815.5151515231.0303030362.06060606.....................383916659267088.033318534175.066637068350.0133274000000.0266548000000.0394033318534175.066637068350.0133274000000.0266548000000.0533097000000.0404166637068350.0133274000000.0266548000000.0533097000000.01066190000000.04142133274000000.0266548000000.0533097000000.01066190000000.02132390000000.04243266548000000.0533097000000.01066190000000.02132390000000.04264770000000.04344533097000000.01066190000000.02132390000000.04264770000000.08529540000000.044451066190000000.02132390000000.04264770000000.08529540000000.017059100000000.0 In\u00a0[7]: Copied! ## should you however want to select the headers instead of importing everything\n## (which maybe timeconsuming), simply use get_headers(path)\nfrom tablite.tools import get_headers\nfrom pathlib import Path\npath = Path('tests/data/book1.csv')\nsample = get_headers(path, linecount=5)\nprint(f\"sample is of type {type(sample)} and has the following entries:\")\nfor k,v in sample.items():\n print(k)\n if isinstance(v,list):\n for r in sample[k]:\n print(\"\\t\", r)\n ## should you however want to select the headers instead of importing everything ## (which maybe timeconsuming), simply use get_headers(path) from tablite.tools import get_headers from pathlib import Path path = Path('tests/data/book1.csv') sample = get_headers(path, linecount=5) print(f\"sample is of type {type(sample)} and has the following entries:\") for k,v in sample.items(): print(k) if isinstance(v,list): for r in sample[k]: print(\"\\t\", r) sample is of type <class 'dict'> and has the following entries:\ndelimiter\nbook1.csv\n\t ['a', 'b', 'c', 'd', 'e', 'f']\n\t ['1', '0.060606061', '0.090909091', '0.121212121', '0.151515152', '0.181818182']\n\t ['2', '0.121212121', '0.242424242', '0.484848485', '0.96969697', '1.939393939']\n\t ['3', '0.242424242', '0.484848485', '0.96969697', '1.939393939', '3.878787879']\n\t ['4', '0.484848485', '0.96969697', '1.939393939', '3.878787879', '7.757575758']\n\t ['5', '0.96969697', '1.939393939', '3.878787879', '7.757575758', '15.51515152']\n In\u00a0[8]: Copied! ## to extend a table by adding columns, use t[new] = [new values]\nt['C'] = [4,5,6]\n## but make sure the column has the same length as the rest of the table!\nt\n ## to extend a table by adding columns, use t[new] = [new values] t['C'] = [4,5,6] ## but make sure the column has the same length as the rest of the table! t Out[8]: #ABC 01a4 12b5 23c6 In\u00a0[9]: Copied! 
## should you want to mix datatypes, tablite will not complain:\nfrom datetime import datetime, date,time,timedelta\nimport numpy as np\n## What you put in ...\nt4 = Table()\nt4['mixed'] = [\n -1,0,1, # regular integers\n -12345678909876543211234567890987654321, # very very large integer\n None,np.nan, # null values \n \"one\", \"\", # strings\n True,False, # booleans\n float('inf'), 0.01, # floats\n date(2000,1,1), # date\n datetime(2002,2,3,23,0,4,6660), # datetime\n time(12,12,12), # time\n timedelta(days=3, seconds=5678) # timedelta\n]\n## ... is exactly what you get out:\nt4\n ## should you want to mix datatypes, tablite will not complain: from datetime import datetime, date,time,timedelta import numpy as np ## What you put in ... t4 = Table() t4['mixed'] = [ -1,0,1, # regular integers -12345678909876543211234567890987654321, # very very large integer None,np.nan, # null values \"one\", \"\", # strings True,False, # booleans float('inf'), 0.01, # floats date(2000,1,1), # date datetime(2002,2,3,23,0,4,6660), # datetime time(12,12,12), # time timedelta(days=3, seconds=5678) # timedelta ] ## ... is exactly what you get out: t4 Out[9]: #mixed 0-1 10 21 3-12345678909876543211234567890987654321 4None 5nan 6one 7 8True 9False10inf110.01122000-01-01132002-02-03 23:00:04.0066601412:12:12153 days, 1:34:38 In\u00a0[10]: Copied! ## also if you claim the values back as a python list:\nfor item in list(t4['mixed']):\n print(item)\n ## also if you claim the values back as a python list: for item in list(t4['mixed']): print(item) -1\n0\n1\n-12345678909876543211234567890987654321\nNone\nnan\none\n\nTrue\nFalse\ninf\n0.01\n2000-01-01\n2002-02-03 23:00:04.006660\n12:12:12\n3 days, 1:34:38\n The column itself (__repr__ ) shows us the pid , file location and the entries, so you know exactly what you're working with. In\u00a0[11]: Copied! t4['mixed']\n t4['mixed'] Out[11]: Column(/tmp/tablite-tmp/pid-54911, [-1 0 1 -12345678909876543211234567890987654321 None nan 'one' '' True\n False inf 0.01 datetime.date(2000, 1, 1)\n datetime.datetime(2002, 2, 3, 23, 0, 4, 6660) datetime.time(12, 12, 12)\n datetime.timedelta(days=3, seconds=5678)]) In\u00a0[12]: Copied! ## to view the datatypes in a column, use Column.types()\ntype_dict = t4['mixed'].types()\nfor k,v in type_dict.items():\n print(k,v)\n ## to view the datatypes in a column, use Column.types() type_dict = t4['mixed'].types() for k,v in type_dict.items(): print(k,v) <class 'int'> 4\n<class 'NoneType'> 1\n<class 'float'> 3\n<class 'str'> 2\n<class 'bool'> 2\n<class 'datetime.date'> 1\n<class 'datetime.datetime'> 1\n<class 'datetime.time'> 1\n<class 'datetime.timedelta'> 1\n In\u00a0[13]: Copied! ## You may have noticed that all datatypes in t3 where identified as floats, despite their origin from a text type file.\n## This is because tablite guesses the most probable datatype using the `.guess` function on each column.\n## You can use the .guess function like this:\nfrom tablite import DataTypes\nt3['a'] = DataTypes.guess(t3['a'])\n## You can also convert the datatype using a list comprehension\nt3['b'] = [float(v) for v in t3['b']]\nt3\n ## You may have noticed that all datatypes in t3 where identified as floats, despite their origin from a text type file. ## This is because tablite guesses the most probable datatype using the `.guess` function on each column. 
## You can use the .guess function like this: from tablite import DataTypes t3['a'] = DataTypes.guess(t3['a']) ## You can also convert the datatype using a list comprehension t3['b'] = [float(v) for v in t3['b']] t3 Out[13]: #abcdef 010.0606060610.0909090910.1212121210.1515151520.181818182 120.1212121210.2424242420.4848484850.969696971.939393939 230.2424242420.4848484850.969696971.9393939393.878787879 340.4848484850.969696971.9393939393.8787878797.757575758 450.969696971.9393939393.8787878797.75757575815.51515152 561.9393939393.8787878797.75757575815.5151515231.03030303 673.8787878797.75757575815.5151515231.0303030362.06060606.....................383916659267088.033318534175.066637068350.0133274000000.0266548000000.0394033318534175.066637068350.0133274000000.0266548000000.0533097000000.0404166637068350.0133274000000.0266548000000.0533097000000.01066190000000.04142133274000000.0266548000000.0533097000000.01066190000000.02132390000000.04243266548000000.0533097000000.01066190000000.02132390000000.04264770000000.04344533097000000.01066190000000.02132390000000.04264770000000.08529540000000.044451066190000000.02132390000000.04264770000000.08529540000000.017059100000000.0 In\u00a0[14]: Copied! t = Table()\nfor column_name in 'abcde':\n t[column_name] =[i for i in range(5)]\n t = Table() for column_name in 'abcde': t[column_name] =[i for i in range(5)] (2) we want to add two new columns using the functions: In\u00a0[15]: Copied! def f1(a,b,c):\n return a+b+c+1\ndef f2(b,c,d):\n return b*c*d\n def f1(a,b,c): return a+b+c+1 def f2(b,c,d): return b*c*d (3) and we want to compute two new columns f and g : In\u00a0[16]: Copied! t.add_columns('f', 'g')\n t.add_columns('f', 'g') (4) we can now use the filter, to iterate over the table, and add the values to the two new columns: In\u00a0[17]: Copied! f,g=[],[]\nfor row in t['a', 'b', 'c', 'd'].rows:\n a, b, c, d = row\n\n f.append(f1(a, b, c))\n g.append(f2(b, c, d))\nt['f'] = f\nt['g'] = g\n\nassert len(t) == 5\nassert list(t.columns) == list('abcdefg')\nt\n f,g=[],[] for row in t['a', 'b', 'c', 'd'].rows: a, b, c, d = row f.append(f1(a, b, c)) g.append(f2(b, c, d)) t['f'] = f t['g'] = g assert len(t) == 5 assert list(t.columns) == list('abcdefg') t Out[17]: #abcdefg 00000010 11111141 22222278 3333331027 4444441364 Take note that if your dataset is assymmetric, a warning will be show: In\u00a0[18]: Copied! assymmetric_table = Table({'a':[1,2,3], 'b':[1,2]})\nfor row in assymmetric_table.rows:\n print(row)\n## warning at the bottom ---v\n assymmetric_table = Table({'a':[1,2,3], 'b':[1,2]}) for row in assymmetric_table.rows: print(row) ## warning at the bottom ---v [1, 1]\n[2, 2]\n[3, None]\n /home/bjorn/github/tablite/tablite/base.py:1188: UserWarning: Column b has length 2 / 3. None will appear as fill value.\n warnings.warn(f\"Column {name} has length {len(column)} / {n_max}. None will appear as fill value.\")\n In\u00a0[19]: Copied! table7 = Table(columns={\n'A': [1,1,2,2,3,4],\n'B': [1,1,2,2,30,40],\n'C': [-1,-2,-3,-4,-5,-6]\n})\nindex = table7.index('A', 'B')\nfor k, v in index.items():\n print(\"key\", k, \"indices\", v)\n table7 = Table(columns={ 'A': [1,1,2,2,3,4], 'B': [1,1,2,2,30,40], 'C': [-1,-2,-3,-4,-5,-6] }) index = table7.index('A', 'B') for k, v in index.items(): print(\"key\", k, \"indices\", v) key (1, 1) indices [0, 1]\nkey (2, 2) indices [2, 3]\nkey (3, 30) indices [4]\nkey (4, 40) indices [5]\n The keys are created for each unique column-key-pair, and the value is the index where the key is found. 
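Since the index is a plain dict, a single key's row numbers can also be read directly. A minimal sketch (the key and result match the print-out above):

index[(2, 2)]   # -> [2, 3]
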
To fetch all rows for key (2,2) , we can use: In\u00a0[20]: Copied! for ix, row in enumerate(table7.rows):\n if ix in index[(2,2)]:\n print(row)\n for ix, row in enumerate(table7.rows): if ix in index[(2,2)]: print(row) [2, 2, -3]\n[2, 2, -4]\n In\u00a0[21]: Copied! ## to append one table to another, use + or += \nprint('length before:', len(t3)) # length before: 45\nt5 = t3 + t3 \nprint('length after +', len(t5)) # length after + 90\nt5 += t3 \nprint('length after +=', len(t5)) # length after += 135\n## if you need a lot of numbers for a test, you can repeat a table using * and *=\nt5 *= 1_000\nprint('length after +=', len(t5)) # length after += 135000\n ## to append one table to another, use + or += print('length before:', len(t3)) # length before: 45 t5 = t3 + t3 print('length after +', len(t5)) # length after + 90 t5 += t3 print('length after +=', len(t5)) # length after += 135 ## if you need a lot of numbers for a test, you can repeat a table using * and *= t5 *= 1_000 print('length after +=', len(t5)) # length after += 135000 length before: 45\nlength after + 90\nlength after += 135\nlength after += 135000\n In\u00a0[22]: Copied! t5\n t5 Out[22]: #abcdef 010.0606060610.0909090910.1212121210.1515151520.181818182 120.1212121210.2424242420.4848484850.969696971.939393939 230.2424242420.4848484850.969696971.9393939393.878787879 340.4848484850.969696971.9393939393.8787878797.757575758 450.969696971.9393939393.8787878797.75757575815.51515152 561.9393939393.8787878797.75757575815.5151515231.03030303 673.8787878797.75757575815.5151515231.0303030362.06060606..................... 134,9933916659267088.033318534175.066637068350.0133274000000.0266548000000.0 134,9944033318534175.066637068350.0133274000000.0266548000000.0533097000000.0 134,9954166637068350.0133274000000.0266548000000.0533097000000.01066190000000.0 134,99642133274000000.0266548000000.0533097000000.01066190000000.02132390000000.0 134,99743266548000000.0533097000000.01066190000000.02132390000000.04264770000000.0 134,99844533097000000.01066190000000.02132390000000.04264770000000.08529540000000.0 134,999451066190000000.02132390000000.04264770000000.08529540000000.017059100000000.0 In\u00a0[23]: Copied! ## if your are in doubt whether your tables will be the same you can use .stack(other)\nassert t.columns != t2.columns # compares list of column names.\nt6 = t.stack(t2)\nt6\n ## if your are in doubt whether your tables will be the same you can use .stack(other) assert t.columns != t2.columns # compares list of column names. t6 = t.stack(t2) t6 Out[23]: #abcdefgAB 00000010NoneNone 11111141NoneNone 22222278NoneNone 3333331027NoneNone 4444441364NoneNone 5NoneNoneNoneNoneNoneNoneNone1a 6NoneNoneNoneNoneNoneNoneNone2b 7NoneNoneNoneNoneNoneNoneNone3c In\u00a0[24]: Copied! ## As you can see above, t6['C'] is padded with \"None\" where t2 was missing the columns.\n\n## if you need a more detailed view of the columns you can iterate:\nfor name in t.columns:\n col_from_t = t[name]\n if name in t2.columns:\n col_from_t2 = t2[name]\n print(name, col_from_t == col_from_t2)\n else:\n print(name, \"not in t2\")\n ## As you can see above, t6['C'] is padded with \"None\" where t2 was missing the columns. ## if you need a more detailed view of the columns you can iterate: for name in t.columns: col_from_t = t[name] if name in t2.columns: col_from_t2 = t2[name] print(name, col_from_t == col_from_t2) else: print(name, \"not in t2\") a not in t2\nb not in t2\nc not in t2\nd not in t2\ne not in t2\nf not in t2\ng not in t2\n In\u00a0[25]: Copied! 
## to make a copy of a table, use table.copy()\nt3_copy = t3.copy()\n\n## you can also perform multi criteria selections using getitem [ ... ]\nt3_slice = t3['a','b','d', 5:25:5]\nt3_slice\n ## to make a copy of a table, use table.copy() t3_copy = t3.copy() ## you can also perform multi criteria selections using getitem [ ... ] t3_slice = t3['a','b','d', 5:25:5] t3_slice Out[25]: #abd 061.9393939397.757575758 11162.06060606248.2424242 2161985.9393947943.757576 32163550.06061254200.2424 In\u00a0[26]: Copied! ##deleting items also works the same way:\ndel t3_slice[1:3] # delete row number 2 & 3 \nt3_slice\n ##deleting items also works the same way: del t3_slice[1:3] # delete row number 2 & 3 t3_slice Out[26]: #abd 061.9393939397.757575758 12163550.06061254200.2424 In\u00a0[27]: Copied! ## to wipe a table, use .clear:\nt3_slice.clear()\nt3_slice\n ## to wipe a table, use .clear: t3_slice.clear() t3_slice Out[27]: Empty Table In\u00a0[28]: Copied! ## tablite uses .npy for storage because it is fast.\n## this means you can make a table persistent using .save\nlocal_file = Path(\"local_file.tpz\")\nt5.save(local_file)\n\nold_t5 = Table.load(local_file)\nprint(\"the t5 table had\", len(old_t5), \"rows\") # the t5 table had 135000 rows\n\ndel old_t5 # only removes the in-memory object\n\nprint(\"old_t5 still exists?\", local_file.exists())\nprint(\"path:\", local_file)\n\nimport os\nos.remove(local_file)\n ## tablite uses .npy for storage because it is fast. ## this means you can make a table persistent using .save local_file = Path(\"local_file.tpz\") t5.save(local_file) old_t5 = Table.load(local_file) print(\"the t5 table had\", len(old_t5), \"rows\") # the t5 table had 135000 rows del old_t5 # only removes the in-memory object print(\"old_t5 still exists?\", local_file.exists()) print(\"path:\", local_file) import os os.remove(local_file) loading 'local_file.tpz' file: 55%|\u2588\u2588\u2588\u2588\u2588\u258d | 9851/18000 [00:02<00:01, 4386.96it/s] loading 'local_file.tpz' file: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 18000/18000 [00:04<00:00, 4417.27it/s]\n the t5 table had 135000 rows\nold_t5 still exists? True\npath: local_file.tpz\n If you want to save a table from one session to another use save=True . This tells the garbage collector to leave the tablite Table on disk, so you can load it again without changing your code. For example: First time you run t = Table.import_file(....big.csv) it may take a minute or two. If you then add t.save=True and restart python, the second time you run t = Table.import_file(....big.csv) it will take a few milliseconds instead of minutes. In\u00a0[29]: Copied! unfiltered = Table({'a':[1,2,3,4], 'b':[10,20,30,40]})\n unfiltered = Table({'a':[1,2,3,4], 'b':[10,20,30,40]}) In\u00a0[30]: Copied! true,false = unfiltered.filter(\n [\n {\"column1\": 'a', \"criteria\":\">=\", 'value2':3}\n ], filter_type='all'\n)\n true,false = unfiltered.filter( [ {\"column1\": 'a', \"criteria\":\">=\", 'value2':3} ], filter_type='all' ) In\u00a0[31]: Copied! true\n true Out[31]: #ab 0330 1440 In\u00a0[32]: Copied! false.show() # using show here to show that terminal users can have a nice view too.\n false.show() # using show here to show that terminal users can have a nice view too. +==+=+==+\n|# |a|b |\n+--+-+--+\n| 0|1|10|\n| 1|2|20|\n+==+=+==+\n In\u00a0[33]: Copied! ty = Table({'a':[1,2,3,4],'b': [10,20,30,40]})\n ty = Table({'a':[1,2,3,4],'b': [10,20,30,40]}) In\u00a0[34]: Copied! 
## typical python\nany(i > 3 for i in ty['a'])\n ## typical python any(i > 3 for i in ty['a']) Out[34]: True In\u00a0[35]: Copied! ## hereby you can do:\nany( ty.any(**{'a':lambda x:x>3}).rows )\n ## hereby you can do: any( ty.any(**{'a':lambda x:x>3}).rows ) Out[35]: True In\u00a0[36]: Copied! ## if you have multiple criteria this also works:\nall( ty.all(**{'a': lambda x:x>=2, 'b': lambda x:x<=30}).rows )\n ## if you have multiple criteria this also works: all( ty.all(**{'a': lambda x:x>=2, 'b': lambda x:x<=30}).rows ) Out[36]: True In\u00a0[37]: Copied! ## or this if you want to see the table.\nty.all(a=lambda x:x>2, b=lambda x:x<=30)\n ## or this if you want to see the table. ty.all(a=lambda x:x>2, b=lambda x:x<=30) Out[37]: #ab 0330 In\u00a0[38]: Copied! ## As `all` and `any` returns tables, this also means that you can chain operations:\nty.any(a=lambda x:x>2).any(b=30)\n ## As `all` and `any` returns tables, this also means that you can chain operations: ty.any(a=lambda x:x>2).any(b=30) Out[38]: #ab 0330 In\u00a0[39]: Copied! table = Table({\n 'A':[ 1, None, 8, 3, 4, 6, 5, 7, 9],\n 'B':[10,'100', 1, 1, 1, 1, 10, 10, 10],\n 'C':[ 0, 1, 0, 1, 0, 1, 0, 1, 0],\n})\ntable\n table = Table({ 'A':[ 1, None, 8, 3, 4, 6, 5, 7, 9], 'B':[10,'100', 1, 1, 1, 1, 10, 10, 10], 'C':[ 0, 1, 0, 1, 0, 1, 0, 1, 0], }) table Out[39]: #ABC 01100 1None1001 2810 3311 4410 5611 65100 77101 89100 In\u00a0[40]: Copied! sort_order = {'B': False, 'C': False, 'A': False}\nassert not table.is_sorted(mapping=sort_order)\n\nsorted_table = table.sort(mapping=sort_order)\nsorted_table\n sort_order = {'B': False, 'C': False, 'A': False} assert not table.is_sorted(mapping=sort_order) sorted_table = table.sort(mapping=sort_order) sorted_table creating sort index: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 3/3 [00:00<00:00, 2719.45it/s]\ncreating sort index: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 3/3 [00:00<00:00, 3434.20it/s]\njoin: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 3/3 [00:00<00:00, 1902.47it/s]\n Sort is reasonable effective as it uses multiprocessing above a million fields. Hint: You can set this limit in tablite.config , like this: In\u00a0[41]: Copied! from tablite.config import Config\nprint(f\"multiprocessing is used above {Config.SINGLE_PROCESSING_LIMIT:,} fields\")\n from tablite.config import Config print(f\"multiprocessing is used above {Config.SINGLE_PROCESSING_LIMIT:,} fields\") multiprocessing is used above 1,000,000 fields\n In\u00a0[42]: Copied! import math\nn = math.ceil(1_000_000 / (9*3))\n\ntable = Table({\n 'A':[ 1, None, 8, 3, 4, 6, 5, 7, 9]*n,\n 'B':[10,'100', 1, 1, 1, 1, 10, 10, 10]*n,\n 'C':[ 0, 1, 0, 1, 0, 1, 0, 1, 0]*n,\n})\ntable\n import math n = math.ceil(1_000_000 / (9*3)) table = Table({ 'A':[ 1, None, 8, 3, 4, 6, 5, 7, 9]*n, 'B':[10,'100', 1, 1, 1, 1, 10, 10, 10]*n, 'C':[ 0, 1, 0, 1, 0, 1, 0, 1, 0]*n, }) table Out[42]: #ABC 01100 1None1001 2810 3311 4410 5611 65100............ 333,335810 333,336311 333,337410 333,338611 333,3395100 333,3407101 333,3419100 In\u00a0[43]: Copied! import time as cputime\nstart = cputime.time()\nsort_order = {'B': False, 'C': False, 'A': False}\nsorted_table = table.sort(mapping=sort_order) # sorts 1M values.\nprint(\"table sorting took \", round(cputime.time() - start,3), \"secs\")\nsorted_table\n import time as cputime start = cputime.time() sort_order = {'B': False, 'C': False, 'A': False} sorted_table = table.sort(mapping=sort_order) # sorts 1M values. 
print(\"table sorting took \", round(cputime.time() - start,3), \"secs\") sorted_table creating sort index: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 3/3 [00:00<00:00, 4.20it/s]\njoin: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 3/3 [00:00<00:00, 18.17it/s] table sorting took 0.913 secs\n \n In\u00a0[44]: Copied! n = math.ceil(1_000_000 / (9*3))\n\ntable = Table({\n 'A':[ 1, None, 8, 3, 4, 6, 5, 7, 9]*n,\n 'B':[10,'100', 1, 1, 1, 1, 10, 10, 10]*n,\n 'C':[ 0, 1, 0, 1, 0, 1, 0, 1, 0]*n,\n})\ntable\n n = math.ceil(1_000_000 / (9*3)) table = Table({ 'A':[ 1, None, 8, 3, 4, 6, 5, 7, 9]*n, 'B':[10,'100', 1, 1, 1, 1, 10, 10, 10]*n, 'C':[ 0, 1, 0, 1, 0, 1, 0, 1, 0]*n, }) table Out[44]: #ABC 01100 1None1001 2810 3311 4410 5611 65100............ 333,335810 333,336311 333,337410 333,338611 333,3395100 333,3407101 333,3419100 In\u00a0[45]: Copied! from tablite import GroupBy as gb\ngrpby = table.groupby(keys=['C', 'B'], functions=[('A', gb.count)])\ngrpby\n from tablite import GroupBy as gb grpby = table.groupby(keys=['C', 'B'], functions=[('A', gb.count)]) grpby groupby: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 333342/333342 [00:00<00:00, 427322.50it/s]\n Out[45]: #CBCount(A) 0010111114 1110037038 20174076 31174076 411037038 Here is the list of groupby functions: class GroupBy(object): \n max = Max # shortcuts to avoid having to type a long list of imports.\n min = Min\n sum = Sum\n product = Product\n first = First\n last = Last\n count = Count\n count_unique = CountUnique\n avg = Average\n stdev = StandardDeviation\n median = Median\n mode = Mode\n In\u00a0[46]: Copied! t = Table({\n 'A':[1, 1, 2, 2, 3, 3] * 2,\n 'B':[1, 2, 3, 4, 5, 6] * 2,\n 'C':[6, 5, 4, 3, 2, 1] * 2,\n})\nt\n t = Table({ 'A':[1, 1, 2, 2, 3, 3] * 2, 'B':[1, 2, 3, 4, 5, 6] * 2, 'C':[6, 5, 4, 3, 2, 1] * 2, }) t Out[46]: #ABC 0116 1125 2234 3243 4352 5361 6116 7125 8234 92431035211361 In\u00a0[47]: Copied! t2 = t.pivot(rows=['C'], columns=['A'], functions=[('B', gb.sum), ('B', gb.count)], values_as_rows=False)\nt2\n t2 = t.pivot(rows=['C'], columns=['A'], functions=[('B', gb.sum), ('B', gb.count)], values_as_rows=False) t2 pivot: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 14/14 [00:00<00:00, 3643.83it/s]\n Out[47]: #CSum(B,A=1)Count(B,A=1)Sum(B,A=2)Count(B,A=2)Sum(B,A=3)Count(B,A=3) 0622NoneNoneNoneNone 1542NoneNoneNoneNone 24NoneNone62NoneNone 33NoneNone82NoneNone 42NoneNoneNoneNone102 51NoneNoneNoneNone122 In\u00a0[48]: Copied! numbers = Table()\nnumbers.add_column('number', data=[ 1, 2, 3, 4, None])\nnumbers.add_column('colour', data=['black', 'blue', 'white', 'white', 'blue'])\n\nletters = Table()\nletters.add_column('letter', data=[ 'a', 'b', 'c', 'd', None])\nletters.add_column('color', data=['blue', 'white', 'orange', 'white', 'blue'])\n numbers = Table() numbers.add_column('number', data=[ 1, 2, 3, 4, None]) numbers.add_column('colour', data=['black', 'blue', 'white', 'white', 'blue']) letters = Table() letters.add_column('letter', data=[ 'a', 'b', 'c', 'd', None]) letters.add_column('color', data=['blue', 'white', 'orange', 'white', 'blue']) In\u00a0[49]: Copied! 
## left join\n## SELECT number, letter FROM numbers LEFT JOIN letters ON numbers.colour == letters.color\nleft_join = numbers.left_join(letters, left_keys=['colour'], right_keys=['color'], left_columns=['number'], right_columns=['letter'])\nleft_join\n ## left join ## SELECT number, letter FROM numbers LEFT JOIN letters ON numbers.colour == letters.color left_join = numbers.left_join(letters, left_keys=['colour'], right_keys=['color'], left_columns=['number'], right_columns=['letter']) left_join join: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 2/2 [00:00<00:00, 1221.94it/s]\n Out[49]: #numberletter 01None 12a 22None 3Nonea 4NoneNone 53b 63d 74b 84d In\u00a0[50]: Copied! ## inner join\n## SELECT number, letter FROM numbers JOIN letters ON numbers.colour == letters.color\ninner_join = numbers.inner_join(letters, left_keys=['colour'], right_keys=['color'], left_columns=['number'], right_columns=['letter'])\ninner_join\n ## inner join ## SELECT number, letter FROM numbers JOIN letters ON numbers.colour == letters.color inner_join = numbers.inner_join(letters, left_keys=['colour'], right_keys=['color'], left_columns=['number'], right_columns=['letter']) inner_join join: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 2/2 [00:00<00:00, 1121.77it/s]\n Out[50]: #numberletter 02a 12None 2Nonea 3NoneNone 43b 53d 64b 74d In\u00a0[51]: Copied! # outer join\n## SELECT number, letter FROM numbers OUTER JOIN letters ON numbers.colour == letters.color\nouter_join = numbers.outer_join(letters, left_keys=['colour'], right_keys=['color'], left_columns=['number'], right_columns=['letter'])\nouter_join\n # outer join ## SELECT number, letter FROM numbers OUTER JOIN letters ON numbers.colour == letters.color outer_join = numbers.outer_join(letters, left_keys=['colour'], right_keys=['color'], left_columns=['number'], right_columns=['letter']) outer_join join: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 2/2 [00:00<00:00, 1585.15it/s]\n Out[51]: #numberletter 01None 12a 22None 3Nonea 4NoneNone 53b 63d 74b 84d 9Nonec Q: But ...I think there's a bug in the join... A: Venn diagrams do not explain joins. A Venn diagram is a widely-used diagram style that shows the logical relation between sets, popularised by John Venn in the 1880s. The diagrams are used to teach elementary set theory, and to illustrate simple set relationshipssource: en.wikipedia.org Joins operate over rows and when there are duplicate rows, these will be replicated in the output. Many beginners are surprised by this, because they didn't read the SQL standard. Q: So what do I do? A: If you want to get rid of duplicates using tablite, use the index functionality across all columns and pick the first row from each index. Here's the recipe that starts with plenty of duplicates: In\u00a0[52]: Copied! old_table = Table({\n'A':[1,1,1,2,2,2,3,3,3],\n'B':[1,1,4,2,2,5,3,3,6],\n})\nold_table\n old_table = Table({ 'A':[1,1,1,2,2,2,3,3,3], 'B':[1,1,4,2,2,5,3,3,6], }) old_table Out[52]: #AB 011 111 214 322 422 525 633 733 836 In\u00a0[53]: Copied! ## CREATE TABLE OF UNIQUE ENTRIES (a.k.a. DEDUPLICATE)\nnew_table = old_table.drop_duplicates()\nnew_table\n ## CREATE TABLE OF UNIQUE ENTRIES (a.k.a. DEDUPLICATE) new_table = old_table.drop_duplicates() new_table 9it [00:00, 11329.15it/s]\njoin: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 2/2 [00:00<00:00, 1819.26it/s]\n Out[53]: #AB 011 114 222 325 433 536 You can also use groupby; We'll get to that in a minute. 
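Here's a minimal sketch of the index-based recipe mentioned above, in case you want to do it by hand: build the index across all columns and keep only the first row number of every key. (The names keepers and deduped are illustrative, not part of the API.)

index = old_table.index('A', 'B')                # index across all columns
keepers = {rows[0] for rows in index.values()}   # first row number per unique key

deduped = Table({'A': [], 'B': []})              # start with empty columns
for ix, row in enumerate(old_table.rows):
    if ix in keepers:
        deduped.add_rows(row)                    # add_rows accepts a list of values
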
Lookup is a special case of a search loop: Say for example you are planning a concert and want to make sure that your friends can make it home using public transport: You would have to find the first departure after the concert ends towards their home. A join would only give you a direct match on the time. Lookup allows you \"to iterate through a list of data and find the first match given a set of criteria.\" Here's an example: First we have our list of friends and their stops. In\u00a0[54]: Copied! friends = Table({\n\"name\":['Alice', 'Betty', 'Charlie', 'Dorethy', 'Edward', 'Fred'],\n\"stop\":['Downtown-1', 'Downtown-2', 'Hillside View', 'Hillside Crescent', 'Downtown-2', 'Chicago'],\n})\nfriends\n friends = Table({ \"name\":['Alice', 'Betty', 'Charlie', 'Dorethy', 'Edward', 'Fred'], \"stop\":['Downtown-1', 'Downtown-2', 'Hillside View', 'Hillside Crescent', 'Downtown-2', 'Chicago'], }) friends Out[54]: #namestop 0AliceDowntown-1 1BettyDowntown-2 2CharlieHillside View 3DorethyHillside Crescent 4EdwardDowntown-2 5FredChicago Next we need a list of bus routes and their time and stops. I don't have that, so I'm making one up: In\u00a0[55]: Copied! import random\nrandom.seed(11)\ntable_size = 40\n\ntimes = [DataTypes.time(random.randint(21, 23), random.randint(0, 59)) for i in range(table_size)]\nstops = ['Stadium', 'Hillside', 'Hillside View', 'Hillside Crescent', 'Downtown-1', 'Downtown-2',\n 'Central station'] * 2 + [f'Random Road-{i}' for i in range(table_size)]\nroute = [random.choice([1, 2, 3]) for i in stops]\n import random random.seed(11) table_size = 40 times = [DataTypes.time(random.randint(21, 23), random.randint(0, 59)) for i in range(table_size)] stops = ['Stadium', 'Hillside', 'Hillside View', 'Hillside Crescent', 'Downtown-1', 'Downtown-2', 'Central station'] * 2 + [f'Random Road-{i}' for i in range(table_size)] route = [random.choice([1, 2, 3]) for i in stops] In\u00a0[56]: Copied! bus_table = Table({\n\"time\":times,\n\"stop\":stops[:table_size],\n\"route\":route[:table_size],\n})\nbus_table.sort(mapping={'time': False})\n\nprint(\"Departures from Concert Hall towards ...\")\nbus_table[0:10]\n bus_table = Table({ \"time\":times, \"stop\":stops[:table_size], \"route\":route[:table_size], }) bus_table.sort(mapping={'time': False}) print(\"Departures from Concert Hall towards ...\") bus_table[0:10] creating sort index: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 1/1 [00:00<00:00, 1459.90it/s]\njoin: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 3/3 [00:00<00:00, 2421.65it/s]\n Departures from Concert Hall towards ...\n Out[56]: #timestoproute 021:02:00Random Road-62 121:05:00Hillside Crescent2 221:06:00Hillside1 321:25:00Random Road-241 421:29:00Random Road-161 521:32:00Random Road-211 621:33:00Random Road-121 721:36:00Random Road-233 821:38:00Central station2 921:38:00Random Road-82 Let's say the concerts ends at 21:00 and it takes a 10 minutes to get to the bus-stop. Earliest departure must then be 21:10 - goodbye hugs included. In\u00a0[57]: Copied! 
lookup_1 = friends.lookup(bus_table, (DataTypes.time(21, 10), \"<=\", 'time'), ('stop', \"==\", 'stop'))\nlookup1_sorted = lookup_1.sorted(mapping={'time': False, 'name':False}, sort_mode='unix')\nlookup1_sorted\n lookup_1 = friends.lookup(bus_table, (DataTypes.time(21, 10), \"<=\", 'time'), ('stop', \"==\", 'stop')) lookup1_sorted = lookup_1.sorted(mapping={'time': False, 'name':False}, sort_mode='unix') lookup1_sorted 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 6/6 [00:00<00:00, 1513.92it/s]\njoin: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 3/3 [00:00<00:00, 2003.65it/s]\ncreating sort index: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 2/2 [00:00<00:00, 2589.88it/s]\njoin: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 5/5 [00:00<00:00, 2034.29it/s]\n Out[57]: #namestoptimestop_1route 0FredChicagoNoneNoneNone 1BettyDowntown-221:51:00Downtown-21 2EdwardDowntown-221:51:00Downtown-21 3CharlieHillside View22:19:00Hillside View2 4AliceDowntown-123:12:00Downtown-13 5DorethyHillside Crescent23:54:00Hillside Crescent1 Lookup's ability to use custom criteria thereby makes it far more versatile than SQL joins. But with great power comes great responsibility. In\u00a0[58]: Copied! materials = Table({\n 'bom_id': [1, 2, 3, 4, 5, 6, 7, 8, 9], \n 'partial_of': [1, 2, 3, 4, 5, 6, 7, 4, 6], \n 'sku': ['A', 'irrelevant', 'empty carton', 'pkd carton', 'empty pallet', 'pkd pallet', 'pkd irrelevant', 'ppkd carton', 'ppkd pallet'], \n 'material_id': [None, None, None, 3, None, 5, 3, 3, 5], \n 'quantity': [10, 20, 30, 40, 50, 60, 70, 80, 90]\n})\n # 9 is a partially packed pallet of 6\n\n## multiple values.\nlooking_for = Table({\n 'bom_id': [3,4,6], \n 'moq': [1,2,3]\n })\n materials = Table({ 'bom_id': [1, 2, 3, 4, 5, 6, 7, 8, 9], 'partial_of': [1, 2, 3, 4, 5, 6, 7, 4, 6], 'sku': ['A', 'irrelevant', 'empty carton', 'pkd carton', 'empty pallet', 'pkd pallet', 'pkd irrelevant', 'ppkd carton', 'ppkd pallet'], 'material_id': [None, None, None, 3, None, 5, 3, 3, 5], 'quantity': [10, 20, 30, 40, 50, 60, 70, 80, 90] }) # 9 is a partially packed pallet of 6 ## multiple values. looking_for = Table({ 'bom_id': [3,4,6], 'moq': [1,2,3] }) Our goal is now to find the quantity from the materials table based on the items in the looking_for table. This requires two steps: - lookup
- filter with
all by dropping items that didn't match. In\u00a0[59]: Copied! ## step 1/2:\nproducts_lookup = materials.lookup(looking_for, (\"bom_id\", \"==\", \"bom_id\"), (\"partial_of\", \"==\", \"bom_id\"), all=False) \nproducts_lookup\n ## step 1/2: products_lookup = materials.lookup(looking_for, (\"bom_id\", \"==\", \"bom_id\"), (\"partial_of\", \"==\", \"bom_id\"), all=False) products_lookup 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 9/9 [00:00<00:00, 3651.81it/s]\njoin: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 2/2 [00:00<00:00, 1625.38it/s]\n Out[59]: #bom_idpartial_ofskumaterial_idquantitybom_id_1moq 011ANone10NoneNone 122irrelevantNone20NoneNone 233empty cartonNone3031 344pkd carton34042 455empty palletNone50NoneNone 566pkd pallet56063 677pkd irrelevant370NoneNone 784ppkd carton38042 896ppkd pallet59063 In\u00a0[60]: Copied! ## step 2/2:\nproducts = products_lookup.all(bom_id_1=lambda x: x is not None)\nproducts\n ## step 2/2: products = products_lookup.all(bom_id_1=lambda x: x is not None) products Out[60]: #bom_idpartial_ofskumaterial_idquantitybom_id_1moq 033empty cartonNone3031 144pkd carton34042 266pkd pallet56063 384ppkd carton38042 496ppkd pallet59063 The faster way to solve this problem is to use match ! Here is the example: In\u00a0[61]: Copied! products_matched = materials.match(looking_for, (\"bom_id\", \"==\", \"bom_id\"), (\"partial_of\", \"==\", \"bom_id\"))\nproducts_matched\n products_matched = materials.match(looking_for, (\"bom_id\", \"==\", \"bom_id\"), (\"partial_of\", \"==\", \"bom_id\")) products_matched Out[61]: #bom_idpartial_ofskumaterial_idquantitybom_id_1moq 033empty cartonNone3031 144pkd carton34042 266pkd pallet56063 384ppkd carton38042 496ppkd pallet59063 In\u00a0[62]: Copied! assert products == products_matched\n assert products == products_matched In\u00a0[63]: Copied! from tablite import Table\nt = Table() # create table\nt.add_columns('row','A','B','C') # add columns\n from tablite import Table t = Table() # create table t.add_columns('row','A','B','C') # add columns The following examples are all valid and append the row (1,2,3) to the table. In\u00a0[64]: Copied! t.add_rows(1, 1, 2, 3) # individual values\nt.add_rows([2, 1, 2, 3]) # list of values\nt.add_rows((3, 1, 2, 3)) # tuple of values\nt.add_rows(*(4, 1, 2, 3)) # unpacked tuple\nt.add_rows(row=5, A=1, B=2, C=3) # keyword - args\nt.add_rows(**{'row': 6, 'A': 1, 'B': 2, 'C': 3}) # dict / json.\n t.add_rows(1, 1, 2, 3) # individual values t.add_rows([2, 1, 2, 3]) # list of values t.add_rows((3, 1, 2, 3)) # tuple of values t.add_rows(*(4, 1, 2, 3)) # unpacked tuple t.add_rows(row=5, A=1, B=2, C=3) # keyword - args t.add_rows(**{'row': 6, 'A': 1, 'B': 2, 'C': 3}) # dict / json. The following examples add two rows to the table In\u00a0[65]: Copied! t.add_rows((7, 1, 2, 3), (8, 4, 5, 6)) # two (or more) tuples.\nt.add_rows([9, 1, 2, 3], [10, 4, 5, 6]) # two or more lists\nt.add_rows({'row': 11, 'A': 1, 'B': 2, 'C': 3},\n {'row': 12, 'A': 4, 'B': 5, 'C': 6}) # two (or more) dicts as args.\nt.add_rows(*[{'row': 13, 'A': 1, 'B': 2, 'C': 3},\n {'row': 14, 'A': 1, 'B': 2, 'C': 3}]) # list of dicts.\n t.add_rows((7, 1, 2, 3), (8, 4, 5, 6)) # two (or more) tuples. t.add_rows([9, 1, 2, 3], [10, 4, 5, 6]) # two or more lists t.add_rows({'row': 11, 'A': 1, 'B': 2, 'C': 3}, {'row': 12, 'A': 4, 'B': 5, 'C': 6}) # two (or more) dicts as args. t.add_rows(*[{'row': 13, 'A': 1, 'B': 2, 'C': 3}, {'row': 14, 'A': 1, 'B': 2, 'C': 3}]) # list of dicts. In\u00a0[66]: Copied! 
t\n t Out[66]: #rowABC 01123 12123 23123 34123 45123 56123 67123 78456 89123 9104561011123111245612131231314123 As the row incremented from 1 in the first of these examples, and finished with row: 14 , you can now see the whole table above In\u00a0[67]: Copied! from pathlib import Path\npath = Path('tests/data/book1.csv')\ntx = Table.from_file(path)\ntx\n from pathlib import Path path = Path('tests/data/book1.csv') tx = Table.from_file(path) tx Collecting tasks: 'tests/data/book1.csv'\nDumping tasks: 'tests/data/book1.csv'\n importing file: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 1/1 [00:00<00:00, 444.08it/s]\n Out[67]: #abcdef 010.0606060610.0909090910.1212121210.1515151520.181818182 120.1212121210.2424242420.4848484850.969696971.939393939 230.2424242420.4848484850.969696971.9393939393.878787879 340.4848484850.969696971.9393939393.8787878797.757575758 450.969696971.9393939393.8787878797.75757575815.51515152 561.9393939393.8787878797.75757575815.5151515231.03030303 673.8787878797.75757575815.5151515231.0303030362.06060606.....................383916659267088.033318534175.066637068350.0133274000000.0266548000000.0394033318534175.066637068350.0133274000000.0266548000000.0533097000000.0404166637068350.0133274000000.0266548000000.0533097000000.01066190000000.04142133274000000.0266548000000.0533097000000.01066190000000.02132390000000.04243266548000000.0533097000000.01066190000000.02132390000000.04264770000000.04344533097000000.01066190000000.02132390000000.04264770000000.08529540000000.044451066190000000.02132390000000.04264770000000.08529540000000.017059100000000.0 Note that you can also add start, limit and chunk_size to the file reader. Here's an example: In\u00a0[68]: Copied! path = Path('tests/data/book1.csv')\ntx2 = Table.from_file(path, start=2, limit=15)\ntx2\n path = Path('tests/data/book1.csv') tx2 = Table.from_file(path, start=2, limit=15) tx2 Collecting tasks: 'tests/data/book1.csv'\n importing file: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 1/1 [00:00<00:00, 391.22it/s] Dumping tasks: 'tests/data/book1.csv'\n \n Out[68]: #abcdef 030.2424242420.4848484850.969696971.9393939393.878787879 140.4848484850.969696971.9393939393.8787878797.757575758 250.969696971.9393939393.8787878797.75757575815.51515152 361.9393939393.8787878797.75757575815.5151515231.03030303 473.8787878797.75757575815.5151515231.0303030362.06060606 587.75757575815.5151515231.0303030362.06060606124.1212121 6915.5151515231.0303030362.06060606124.1212121248.2424242 71031.0303030362.06060606124.1212121248.2424242496.4848485 81162.06060606124.1212121248.2424242496.4848485992.969697 912124.1212121248.2424242496.4848485992.9696971985.9393941013248.2424242496.4848485992.9696971985.9393943971.8787881114496.4848485992.9696971985.9393943971.8787887943.7575761215992.9696971985.9393943971.8787887943.75757615887.5151513161985.9393943971.8787887943.75757615887.5151531775.030314173971.8787887943.75757615887.5151531775.030363550.06061 How good is the file_reader? I've included all formats in the test suite that are publicly available from the Alan Turing institute, dateutils) and Python's csv reader. What about MM-DD-YYYY formats? Some users from the US ask why the csv reader doesn't read the month-day-year format. The answer is simple: It's not an iso8601 format. The US month-day-year format is a locale that may be used a lot in the US, but it isn't an international standard. If you need to work with MM-DD-YYYY you will find that the file_reader will import the values as text (str). 
You can then reformat it with a custom function like: In\u00a0[69]: Copied! s = \"03-21-1998\"\nfrom datetime import date\nf = lambda s: date(int(s[-4:]), int(s[:2]), int(s[3:5]))\nf(s)\n s = \"03-21-1998\" from datetime import date f = lambda s: date(int(s[-4:]), int(s[:2]), int(s[3:5])) f(s) Out[69]: datetime.date(1998, 3, 21) In\u00a0[70]: Copied! from tablite.import_utils import file_readers\nfor k,v in file_readers.items():\n print(k,v)\n from tablite.import_utils import file_readers for k,v in file_readers.items(): print(k,v) fods <function excel_reader at 0x7f36a3ef8c10>\njson <function excel_reader at 0x7f36a3ef8c10>\nhtml <function from_html at 0x7f36a3ef8b80>\nhdf5 <function from_hdf5 at 0x7f36a3ef8a60>\nsimple <function excel_reader at 0x7f36a3ef8c10>\nrst <function excel_reader at 0x7f36a3ef8c10>\nmediawiki <function excel_reader at 0x7f36a3ef8c10>\nxlsx <function excel_reader at 0x7f36a3ef8c10>\nxls <function excel_reader at 0x7f36a3ef8c10>\nxlsm <function excel_reader at 0x7f36a3ef8c10>\ncsv <function text_reader at 0x7f36a3ef9000>\ntsv <function text_reader at 0x7f36a3ef9000>\ntxt <function text_reader at 0x7f36a3ef9000>\nods <function ods_reader at 0x7f36a3ef8ca0>\n (2) define your new file reader In\u00a0[71]: Copied! def my_magic_reader(path, **kwargs): # define your new file reader.\n print(\"do magic with {path}\")\n return\n def my_magic_reader(path, **kwargs): # define your new file reader. print(\"do magic with {path}\") return (3) add it to the list of readers. In\u00a0[72]: Copied! file_readers['my_special_format'] = my_magic_reader\n file_readers['my_special_format'] = my_magic_reader The file_readers are all in tablite.core so if you intend to extend the readers, I recommend that you start here. In\u00a0[73]: Copied! file = Path('example.xlsx')\ntx2.to_xlsx(file)\nos.remove(file)\n file = Path('example.xlsx') tx2.to_xlsx(file) os.remove(file) In\u00a0[74]: Copied! from tablite import Table\n\nt = Table({\n'a':[1, 2, 8, 3, 4, 6, 5, 7, 9],\n'b':[10, 100, 3, 4, 16, -1, 10, 10, 10],\n})\nt.sort(mapping={\"a\":False})\nt\n from tablite import Table t = Table({ 'a':[1, 2, 8, 3, 4, 6, 5, 7, 9], 'b':[10, 100, 3, 4, 16, -1, 10, 10, 10], }) t.sort(mapping={\"a\":False}) t creating sort index: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 1/1 [00:00<00:00, 1674.37it/s]\njoin: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 2/2 [00:00<00:00, 1701.89it/s]\n Out[74]: #ab 0110 12100 234 3416 4510 56-1 6710 783 8910 In\u00a0[75]: Copied! %pip install matplotlib -q\n %pip install matplotlib -q Note: you may need to restart the kernel to use updated packages.\n In\u00a0[76]: Copied! import matplotlib.pyplot as plt\nplt.plot(t['a'], t['b'])\nplt.ylabel('Hello Figure')\nplt.show()\n import matplotlib.pyplot as plt plt.plot(t['a'], t['b']) plt.ylabel('Hello Figure') plt.show() In\u00a0[77]: Copied! ## Let's monitor the memory and record the observations into a table!\nimport psutil, os, gc\nfrom time import process_time,sleep\nprocess = psutil.Process(os.getpid())\n\ndef mem_time(): # go and check taskmanagers memory usage.\n return process.memory_info().rss, process_time()\n\ndigits = 1_000_000\n\nrecords = Table({'method':[], 'memory':[], 'time':[]})\n ## Let's monitor the memory and record the observations into a table! import psutil, os, gc from time import process_time,sleep process = psutil.Process(os.getpid()) def mem_time(): # go and check taskmanagers memory usage. 
return process.memory_info().rss, process_time() digits = 1_000_000 records = Table({'method':[], 'memory':[], 'time':[]}) The row based format: 1 million 10-tuples In\u00a0[78]: Copied! before, start = mem_time()\nL = [tuple([11 for _ in range(10)]) for _ in range(digits)]\nafter, end = mem_time() \ndel L\ngc.collect()\n\nrecords.add_rows(*('1e6 lists w. 10 integers', after - before, round(end-start,4)))\nrecords\n before, start = mem_time() L = [tuple([11 for _ in range(10)]) for _ in range(digits)] after, end = mem_time() del L gc.collect() records.add_rows(*('1e6 lists w. 10 integers', after - before, round(end-start,4))) records Out[78]: #methodmemorytime 01e6 lists w. 10 integers1190543360.5045 The column based format: 10 columns with 1M values: In\u00a0[79]: Copied! before, start = mem_time()\nL = [[11 for i2 in range(digits)] for i1 in range(10)]\nafter,end = mem_time()\n\ndel L\ngc.collect()\nrecords.add_rows(('10 lists with 1e6 integers', after - before, round(end-start,4)))\n before, start = mem_time() L = [[11 for i2 in range(digits)] for i1 in range(10)] after,end = mem_time() del L gc.collect() records.add_rows(('10 lists with 1e6 integers', after - before, round(end-start,4))) We've thereby saved 50 Mb by avoiding the overhead from managing 1 million lists. Q: But why didn't I just use an array? It would have even lower memory footprint. A: First, array's don't handle None's and we get that frequently in dirty csv data. Second, Table needs even less memory. Let's try with an array: In\u00a0[80]: Copied! import array\n\nbefore, start = mem_time()\nL = [array.array('i', [11 for _ in range(digits)]) for _ in range(10)]\nafter,end = mem_time()\n\ndel L\ngc.collect()\nrecords.add_rows(('10 lists with 1e6 integers in arrays', after - before, round(end-start,4)))\nrecords\n import array before, start = mem_time() L = [array.array('i', [11 for _ in range(digits)]) for _ in range(10)] after,end = mem_time() del L gc.collect() records.add_rows(('10 lists with 1e6 integers in arrays', after - before, round(end-start,4))) records Out[80]: #methodmemorytime 01e6 lists w. 10 integers1190543360.5045 110 lists with 1e6 integers752762880.1906 210 lists with 1e6 integers in arrays398336000.3633 Finally let's use a tablite.Table : In\u00a0[81]: Copied! before,start = mem_time()\nt = Table(columns={str(i1): [11 for i2 in range(digits)] for i1 in range(10)})\nafter,end = mem_time()\n\nrecords.add_rows(('Table with 10 columns with 1e6 integers', after - before, round(end-start,4)))\n\nbefore,start = mem_time()\nt2 = t.copy()\nafter,end = mem_time()\n\nrecords.add_rows(('2 Tables with 10 columns with 1e6 integers each', after - before, round(end-start,4)))\n\n## Let's show it, so we know nobody's cheating:\nt2\n before,start = mem_time() t = Table(columns={str(i1): [11 for i2 in range(digits)] for i1 in range(10)}) after,end = mem_time() records.add_rows(('Table with 10 columns with 1e6 integers', after - before, round(end-start,4))) before,start = mem_time() t2 = t.copy() after,end = mem_time() records.add_rows(('2 Tables with 10 columns with 1e6 integers each', after - before, round(end-start,4))) ## Let's show it, so we know nobody's cheating: t2 Out[81]: #0123456789 011111111111111111111 111111111111111111111 211111111111111111111 311111111111111111111 411111111111111111111 511111111111111111111 611111111111111111111................................. 
999,99311111111111111111111 999,99411111111111111111111 999,99511111111111111111111 999,99611111111111111111111 999,99711111111111111111111 999,99811111111111111111111 999,99911111111111111111111 In\u00a0[82]: Copied! records\n records Out[82]: #methodmemorytime 01e6 lists w. 10 integers1190543360.5045 110 lists with 1e6 integers752762880.1906 210 lists with 1e6 integers in arrays398336000.3633 3Table with 10 columns with 1e6 integers01.9569 42 Tables with 10 columns with 1e6 integers each00.0001 Conclusion: whilst the common worst case (1M lists with 10 integers) take up 118 Mb of RAM, Tablite's tables vanish in the noise of memory measurement. Pandas also permits the usage of namedtuples, which are unpacked upon entry. from collections import namedtuple\nPoint = namedtuple(\"Point\", \"x y\")\npoints = [Point(0, 0), Point(0, 3)]\npd.DataFrame(points)\n Doing that in tablite is a bit different. To unpack the named tuple, you should do so explicitly: t = Table({'x': [p.x for p in points], 'y': [p.y for p in points]})\n However should you want to keep the points as namedtuple, you can do so in tablite: t = Table()\nt['points'] = points\n Tablite will store a serialised version of the points, so your memory overhead will be close to zero. "},{"location":"tutorial/#tablite","title":"Tablite\u00b6","text":""},{"location":"tutorial/#introduction","title":"Introduction\u00b6","text":"Tablite fills the data-science space where incremental data processing based on: - Datasets are larger than memory.
- You don't want to worry about datatypes.
Tablite thereby competes with: - Pandas, but without the memory overhead.
- Numpy, but spares you from worrying about lower level data types
- SQLite, by sheer speed.
- Polars, by working beyond RAM.
- Other libraries for data cleaning thanks to tablite's powerful
datatypes module. Install: pip install tablite Usage: >>> from tablite import Table Upgrade: pip install tablite --no-cache --upgrade "},{"location":"tutorial/#overview","title":"Overview\u00b6","text":"(Version 2023.6.0 and later. For older version see this) - Tablite handles all Python datatypes:
str , float , bool , int , date , datetime , time , timedelta and None . - you can select:
- all rows in a column as
table['A'] - rows across all columns as
table[4:8] - or a slice as
table['A', 'B', slice(4,8) ] . - you can update values with
table['A'][2] = new value - you can store or send data using json, by:
- dumping to json:
json_str = table.to_json() , or - you can load it with
Table.from_json(json_str) . - you can iterate over rows using
for row in Table.rows . - you can ask
column_xyz in Table.columns ? - load from files with
new_table = Table.from_file('this.csv') which has automatic datatype detection - perform inner, outer & left SQL joins between tables as simply as
table_1.inner_join(table2, keys=['A', 'B']) - summarise using
table.groupby( ... ) - create pivot tables using
groupby.pivot( ... ) - perform multi-criteria lookup in tables using
table1.lookup(table2, criteria=..... - and of course a large selection of tools in
from tablite.tools import * "},{"location":"tutorial/#examples","title":"Examples\u00b6","text":"Here are some examples: "},{"location":"tutorial/#api-examples","title":"API Examples\u00b6","text":"In the following sections, example are given of the Tablite API's power features: - Iteration
- Append
- Sort
- Filter
- Index
- Search All
- Search Any
- Lookup
- Join (inner, outer & left)
- GroupBy
- Pivot table
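Before diving into each feature, here's a minimal warm-up that strings a few of them together. It is a sketch only, using calls demonstrated in the sections below, and the table content is made up for illustration:

from tablite import Table

t = Table({'A': [1, 2, 3, 4], 'B': [10, 20, 30, 40]})
t += t                                            # APPEND: doubles the rows
t.sort(mapping={'A': False})                      # SORT: ascending on 'A'
true, false = t.filter(                           # FILTER: split into matches and the rest
    [{'column1': 'A', 'criteria': '>=', 'value2': 3}], filter_type='all'
)
index = t.index('A', 'B')                         # INDEX: row numbers per unique key pair
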
"},{"location":"tutorial/#iteration","title":"ITERATION!\u00b6","text":"Iteration supports for loops and list comprehension at the speed of light: Just use [r for r in table.rows] , or: for row in table.rows:\n row ... Here's a more practical use case: (1) Imagine a table with columns a,b,c,d,e (all integers) like this: "},{"location":"tutorial/#create-index-indices","title":"Create Index / Indices\u00b6","text":"Index supports multi-key indexing using args such as: index = table.index('B','C') . Here's an example: "},{"location":"tutorial/#append","title":"APPEND\u00b6","text":""},{"location":"tutorial/#save","title":"SAVE\u00b6","text":""},{"location":"tutorial/#filter","title":"FILTER!\u00b6","text":""},{"location":"tutorial/#any-all","title":"Any! All?\u00b6","text":"Any and All are cousins of the filter. They're there so you can use them in the same way as you'd use any and all in python - as boolean evaluators: "},{"location":"tutorial/#sort","title":"SORT!\u00b6","text":""},{"location":"tutorial/#groupby","title":"GROUPBY !\u00b6","text":""},{"location":"tutorial/#did-i-say-pivot-table-yes","title":"Did I say pivot table? Yes.\u00b6","text":"Pivot Table is included in the groupby functionality - so yes - you can pivot the groupby on any column that is used for grouping. Here's a simple example: "},{"location":"tutorial/#join","title":"JOIN!\u00b6","text":""},{"location":"tutorial/#lookup","title":"LOOKUP!\u00b6","text":""},{"location":"tutorial/#match","title":"Match\u00b6","text":"If you're looking to do a join where you afterwards remove the empty rows, match is the faster choice. Here is an example. Let's start with two tables: "},{"location":"tutorial/#are-there-other-ways-i-can-add-data","title":"Are there other ways I can add data?\u00b6","text":"Yes - but row based operations cause a lot of IO, so it'll work but be slower: "},{"location":"tutorial/#okay-great-how-do-i-load-data","title":"Okay, great. How do I load data?\u00b6","text":"Easy. Use file_reader . Here's an example: "},{"location":"tutorial/#sweet-what-formats-are-supported-can-i-add-my-own-file-reader","title":"Sweet. What formats are supported? Can I add my own file reader?\u00b6","text":"Yes! This is very good for special log files or custom json formats. Here's how you do it: (1) Go to all existing readers in the tablite.core and find the closest match. "},{"location":"tutorial/#very-nice-how-about-exporting-data","title":"Very nice. How about exporting data?\u00b6","text":"Just use .export "},{"location":"tutorial/#cool-does-it-play-well-with-plotting-packages","title":"Cool. Does it play well with plotting packages?\u00b6","text":"Yes. Here's an example you can copy and paste: "},{"location":"tutorial/#i-like-sql-can-tablite-understand-sql","title":"I like sql. Can tablite understand SQL?\u00b6","text":"Almost. You can use table.to_sql and tablite will return ANSI-92 compliant SQL. You can also create a table using Table.from_sql and tablite will consume ANSI-92 compliant SQL. "},{"location":"tutorial/#but-what-do-i-do-if-im-about-to-run-out-of-memory","title":"But what do I do if I'm about to run out of memory?\u00b6","text":"You wont. Every tablite table is backed by disk. The memory footprint of a table is only the metadata required to know the relationships between variable names and the datastructures. Let's do a comparison: "},{"location":"tutorial/#conclusions","title":"Conclusions\u00b6","text":"This concludes the mega-tutorial to tablite . There's nothing more to it. 
But oh boy it'll save a lot of time. Here's a summary of features: - Everything a list can do.
- import csv*, fods, json, html, simple, rst, mediawiki, xlsx, xls, xlsm, csv, tsv, txt, ods using
Table.from_file(...) - Iterate over rows or columns
- Create multikey
index , sort , use filter , any and all to select. Perform lookup across tables including using custom functions. - Perform multikey
joins with other tables. - Perform
groupby and reorganise data as a pivot table with max, min, sum, first, last, count, unique, average, standard deviation, median and mode. - Update tables with
+= which automatically sorts out the columns - even if they're not in perfect order. "},{"location":"tutorial/#faq","title":"FAQ\u00b6","text":"Question Answer I'm not in a notebook. Is there a nice way to view tables? Yes. table.show() prints the ascii version I'm looking for the equivalent to apply in pandas. Just use list comprehensions: table[column] = [f(x) for x in table[column]] What about map ? Just use the python function: mapping = map(f, table[column name]) Is there a where function? It's called any or all like in python: table.any(column_name > 0) . I like sql and sqlite. Can I use sql? Yes. Calling table.to_sql() returns an ANSI-92 SQL compliant table definition. You can use this in any SQL compliant engine. Sometimes I need to clean up data with datetimes. Is there any tool to help with that? Yes. Look at DataTypes : DataTypes.round(value, multiple) allows rounding of datetimes. "},{"location":"tutorial/#coming-to-tablite-from-pandas","title":"Coming to Tablite from Pandas\u00b6","text":"If you're coming to Tablite from Pandas you will notice some differences. Here's the ultra short comparison to the documentation from Pandas called 10 minutes intro to pandas The tutorials provide the generic overview: - pandas tutorial
- tablite tutorial
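As a warm-up, here's a minimal sketch of the most common translation: pandas' df['A'].apply(f) becomes a plain list comprehension in tablite (the table t is made up for illustration). The comparison table below covers the rest:

from tablite import Table

t = Table({'A': [1, 2, 3]})
t['A'] = [x * 2 for x in t['A']]   # read, transform, assign back
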
Some key differences topic Tablite Viewing data Just use table.show() in printouts, or if you're in a jupyter notebook just use the variable name table Selection Slicing works both on columns and rows, and you can filter using any or all : table['A','B', 2:30:3].any(A=lambda x:x>3) to copy a table use: t2 = t.copy() This is a very fast deep copy that has no memory overhead, as tablite's memory manager keeps track of the data. Missing data Tablite uses a mixed column format for any column that isn't uniform. To get rid of rows with None s and np.nan s use: table.drop_na(None, np.nan) Alternatively you can use replace: table.replace(None,5) following the syntax: table.replace_missing_values(sources, target) Operations Descriptive statistics are on a column by column basis: table['a'].statistics() the pandas function df.apply doesn't exist in tablite. Use a list comprehension instead. For example: df.apply(np.cumsum) is just np.cumsum(t['A']) \"histogramming\" in tablite is per column: table['a'].histogram() string methods? Just use list comprehensions: table['A', 'B'].any(A=lambda x: \"hello\" in x, B=lambda x: \"world\" in x) Merge Concatenation: Just use + or += as in t1 = t2 + t3 or t1 += t4 . If the columns are out of order, tablite will sort the headers according to the order in the first table. If you're worried about header mismatches, use t1.stack(t2) Joins are ANSI92 compliant: t1.join(t2, <...args...>, join_type=...) . Grouping Tablite supports multikey groupby using from tablite import GroupBy as gb . table.groupby(keys, functions) Reshaping To reshape a table use transpose . To perform pivot-table-like operations, use: table.pivot(rows, columns, functions) Subtotals aside, tablite will give you everything Excel's pivot table can do. Time series To convert time series use a list comprehension: t1['GMT'] = [timedelta(hours=1) + v for v in t1['date'] ] to generate a date range use: from tablite import date_range t['date'] = date_range(start=2022/1/1, stop=2023/1/1, step=timedelta(days=1)) Categorical Pandas only seems to use this for sorting and grouping. Tablite table has .sort , .groupby and .pivot to achieve the same task. Plotting Import your favorite plotting package and feed it the values, such as: import matplotlib.pyplot as plt plt.plot(t['a'],t['b']) plt.show() Import/Export Tablite supports the same import/export options as pandas. Tablite pegs the free memory before IO and can therefore process larger-than-RAM files. Tablite also guesses the datatypes for all ISO formats and uses multiprocessing, and may therefore be faster. Should you want to inspect how guess works, use from tablite.tools import guess and try the function out. Gotchas None really. 
Should you come across something non-pythonic, then please post it on the issue list."},{"location":"reference/base/","title":"Base","text":""},{"location":"reference/base/#tablite.base","title":"tablite.base ","text":""},{"location":"reference/base/#tablite.base-attributes","title":"Attributes","text":""},{"location":"reference/base/#tablite.base.log","title":"tablite.base.log = logging.getLogger(__name__) module-attribute ","text":""},{"location":"reference/base/#tablite.base.file_registry","title":"tablite.base.file_registry = set() module-attribute ","text":""},{"location":"reference/base/#tablite.base-classes","title":"Classes","text":""},{"location":"reference/base/#tablite.base.SimplePage","title":"tablite.base.SimplePage(id, path, len, py_dtype) ","text":" Bases: object Source code in tablite/base.py def __init__(self, id, path, len, py_dtype) -> None:\n self.path = Path(path) / \"pages\" / f\"{id}.npy\"\n self.len = len\n self.dtype = py_dtype\n\n self._incr_refcount()\n "},{"location":"reference/base/#tablite.base.SimplePage-attributes","title":"Attributes","text":""},{"location":"reference/base/#tablite.base.SimplePage.ids","title":"tablite.base.SimplePage.ids = count(start=1) class-attribute instance-attribute ","text":""},{"location":"reference/base/#tablite.base.SimplePage.refcounts","title":"tablite.base.SimplePage.refcounts = {} class-attribute instance-attribute ","text":""},{"location":"reference/base/#tablite.base.SimplePage.autocleanup","title":"tablite.base.SimplePage.autocleanup = True class-attribute instance-attribute ","text":""},{"location":"reference/base/#tablite.base.SimplePage.path","title":"tablite.base.SimplePage.path = Path(path) / 'pages' / f'{id}.npy' instance-attribute ","text":""},{"location":"reference/base/#tablite.base.SimplePage.len","title":"tablite.base.SimplePage.len = len instance-attribute ","text":""},{"location":"reference/base/#tablite.base.SimplePage.dtype","title":"tablite.base.SimplePage.dtype = py_dtype instance-attribute ","text":""},{"location":"reference/base/#tablite.base.SimplePage-functions","title":"Functions","text":""},{"location":"reference/base/#tablite.base.SimplePage.__setstate__","title":"tablite.base.SimplePage.__setstate__(state) ","text":"when an object is unpickled, say in a case of multi-processing, object.setstate(state) is called instead of init, this means we need to update page refcount as if constructor had been called Source code in tablite/base.py def __setstate__(self, state):\n \"\"\"\n when an object is unpickled, say in a case of multi-processing,\n object.__setstate__(state) is called instead of __init__, this means\n we need to update page refcount as if constructor had been called\n \"\"\"\n self.__dict__.update(state)\n\n self._incr_refcount()\n "},{"location":"reference/base/#tablite.base.SimplePage.next_id","title":"tablite.base.SimplePage.next_id(path) classmethod ","text":"Source code in tablite/base.py @classmethod\ndef next_id(cls, path):\n path = Path(path)\n\n while True:\n _id = f\"{os.getpid()}-{next(cls.ids)}\"\n _path = path / \"pages\" / f\"{_id}.npy\"\n\n if not _path.exists():\n break # make sure we don't override existing pages if they are created outside of main thread\n\n return _id\n "},{"location":"reference/base/#tablite.base.SimplePage.__len__","title":"tablite.base.SimplePage.__len__() ","text":"Source code in tablite/base.py def __len__(self):\n return self.len\n "},{"location":"reference/base/#tablite.base.SimplePage.__repr__","title":"tablite.base.SimplePage.__repr__() -> str 
","text":"Source code in tablite/base.py def __repr__(self) -> str:\n try:\n return f\"{self.__class__.__name__}({self.path}, {self.get()})\"\n except FileNotFoundError as e:\n return f\"{self.__class__.__name__}({self.path}, <{type(e).__name__}>)\"\n except Exception as e:\n return f\"{self.__class__.__name__}({self.path}, <{e}>)\"\n "},{"location":"reference/base/#tablite.base.SimplePage.__hash__","title":"tablite.base.SimplePage.__hash__() -> int ","text":"Source code in tablite/base.py def __hash__(self) -> int:\n return hash(self.path)\n "},{"location":"reference/base/#tablite.base.SimplePage.owns","title":"tablite.base.SimplePage.owns() ","text":"Source code in tablite/base.py def owns(self):\n parts = self.path.parts\n\n return all((p in parts for p in Path(Config.pid).parts))\n "},{"location":"reference/base/#tablite.base.SimplePage.__del__","title":"tablite.base.SimplePage.__del__() ","text":"When python's reference count for an object is 0, python uses it's garbage collector to remove the object and free the memory. As tablite tables have columns and columns have page and pages have data stored on disk, the space on disk must be freed up as well. This del override assures the cleanup of stored data. Source code in tablite/base.py def __del__(self):\n \"\"\"When python's reference count for an object is 0, python uses\n it's garbage collector to remove the object and free the memory.\n As tablite tables have columns and columns have page and pages have\n data stored on disk, the space on disk must be freed up as well.\n This __del__ override assures the cleanup of stored data.\n \"\"\"\n if not self.owns():\n return\n\n refcount = self.refcounts[self.path] = max(\n self.refcounts.get(self.path, 0) - 1, 0\n )\n\n if refcount > 0:\n return\n\n if self.autocleanup:\n self.path.unlink(True)\n\n del self.refcounts[self.path]\n "},{"location":"reference/base/#tablite.base.SimplePage.get","title":"tablite.base.SimplePage.get() ","text":"loads stored data RETURNS DESCRIPTION np.ndarray: stored data. Source code in tablite/base.py def get(self):\n \"\"\"loads stored data\n\n Returns:\n np.ndarray: stored data.\n \"\"\"\n array = load_numpy(self.path)\n return MetaArray(array, array.dtype, py_dtype=self.dtype)\n "},{"location":"reference/base/#tablite.base.Page","title":"tablite.base.Page(path, array) ","text":" Bases: SimplePage PARAMETER DESCRIPTION path working directory. 
TYPE: Path array data TYPE: array Source code in tablite/base.py def __init__(self, path, array) -> None:\n \"\"\"\n Args:\n path (Path): working directory.\n array (np.array): data\n \"\"\"\n _id = self.next_id(path)\n\n type_check(array, np.ndarray)\n\n if Config.DISK_LIMIT <= 0:\n pass\n else:\n _, _, free = shutil.disk_usage(path)\n if free - array.nbytes < Config.DISK_LIMIT:\n msg = \"\\n\".join(\n [\n f\"Disk limit reached: Config.DISK_LIMIT = {Config.DISK_LIMIT:,} bytes.\",\n f\"array requires {array.nbytes:,} bytes, but only {free:,} bytes are free.\",\n \"To disable this check, use:\",\n \">>> from tablite.config import Config\",\n \">>> Config.DISK_LIMIT = 0\",\n \"To free space, clean up Config.workdir:\",\n f\"{Config.workdir}\",\n ]\n )\n raise OSError(msg)\n\n _len = len(array)\n # type_check(array, MetaArray)\n if not hasattr(array, \"metadata\"):\n raise ValueError\n _dtype = array.metadata[\"py_dtype\"]\n\n super().__init__(_id, path, _len, _dtype)\n\n np.save(self.path, array, allow_pickle=True, fix_imports=False)\n log.debug(f\"Page saved: {self.path}\")\n "},{"location":"reference/base/#tablite.base.Page-attributes","title":"Attributes","text":""},{"location":"reference/base/#tablite.base.Page.ids","title":"tablite.base.Page.ids = count(start=1) class-attribute instance-attribute ","text":""},{"location":"reference/base/#tablite.base.Page.refcounts","title":"tablite.base.Page.refcounts = {} class-attribute instance-attribute ","text":""},{"location":"reference/base/#tablite.base.Page.autocleanup","title":"tablite.base.Page.autocleanup = True class-attribute instance-attribute ","text":""},{"location":"reference/base/#tablite.base.Page.path","title":"tablite.base.Page.path = Path(path) / 'pages' / f'{id}.npy' instance-attribute ","text":""},{"location":"reference/base/#tablite.base.Page.len","title":"tablite.base.Page.len = len instance-attribute ","text":""},{"location":"reference/base/#tablite.base.Page.dtype","title":"tablite.base.Page.dtype = py_dtype instance-attribute ","text":""},{"location":"reference/base/#tablite.base.Page-functions","title":"Functions","text":""},{"location":"reference/base/#tablite.base.Page.__setstate__","title":"tablite.base.Page.__setstate__(state) ","text":"when an object is unpickled, say in a case of multi-processing, object.setstate(state) is called instead of init, this means we need to update page refcount as if constructor had been called Source code in tablite/base.py def __setstate__(self, state):\n \"\"\"\n when an object is unpickled, say in a case of multi-processing,\n object.__setstate__(state) is called instead of __init__, this means\n we need to update page refcount as if constructor had been called\n \"\"\"\n self.__dict__.update(state)\n\n self._incr_refcount()\n "},{"location":"reference/base/#tablite.base.Page.next_id","title":"tablite.base.Page.next_id(path) classmethod ","text":"Source code in tablite/base.py @classmethod\ndef next_id(cls, path):\n path = Path(path)\n\n while True:\n _id = f\"{os.getpid()}-{next(cls.ids)}\"\n _path = path / \"pages\" / f\"{_id}.npy\"\n\n if not _path.exists():\n break # make sure we don't override existing pages if they are created outside of main thread\n\n return _id\n "},{"location":"reference/base/#tablite.base.Page.__len__","title":"tablite.base.Page.__len__() ","text":"Source code in tablite/base.py def __len__(self):\n return self.len\n "},{"location":"reference/base/#tablite.base.Page.__repr__","title":"tablite.base.Page.__repr__() -> str ","text":"Source code in 
tablite/base.py def __repr__(self) -> str:\n try:\n return f\"{self.__class__.__name__}({self.path}, {self.get()})\"\n except FileNotFoundError as e:\n return f\"{self.__class__.__name__}({self.path}, <{type(e).__name__}>)\"\n except Exception as e:\n return f\"{self.__class__.__name__}({self.path}, <{e}>)\"\n "},{"location":"reference/base/#tablite.base.Page.__hash__","title":"tablite.base.Page.__hash__() -> int ","text":"Source code in tablite/base.py def __hash__(self) -> int:\n return hash(self.path)\n "},{"location":"reference/base/#tablite.base.Page.owns","title":"tablite.base.Page.owns() ","text":"Source code in tablite/base.py def owns(self):\n parts = self.path.parts\n\n return all((p in parts for p in Path(Config.pid).parts))\n "},{"location":"reference/base/#tablite.base.Page.__del__","title":"tablite.base.Page.__del__() ","text":"When python's reference count for an object is 0, python uses it's garbage collector to remove the object and free the memory. As tablite tables have columns and columns have page and pages have data stored on disk, the space on disk must be freed up as well. This del override assures the cleanup of stored data. Source code in tablite/base.py def __del__(self):\n \"\"\"When python's reference count for an object is 0, python uses\n it's garbage collector to remove the object and free the memory.\n As tablite tables have columns and columns have page and pages have\n data stored on disk, the space on disk must be freed up as well.\n This __del__ override assures the cleanup of stored data.\n \"\"\"\n if not self.owns():\n return\n\n refcount = self.refcounts[self.path] = max(\n self.refcounts.get(self.path, 0) - 1, 0\n )\n\n if refcount > 0:\n return\n\n if self.autocleanup:\n self.path.unlink(True)\n\n del self.refcounts[self.path]\n "},{"location":"reference/base/#tablite.base.Page.get","title":"tablite.base.Page.get() ","text":"loads stored data RETURNS DESCRIPTION np.ndarray: stored data. Source code in tablite/base.py def get(self):\n \"\"\"loads stored data\n\n Returns:\n np.ndarray: stored data.\n \"\"\"\n array = load_numpy(self.path)\n return MetaArray(array, array.dtype, py_dtype=self.dtype)\n "},{"location":"reference/base/#tablite.base.Column","title":"tablite.base.Column(path, value=None) ","text":" Bases: object Create Column PARAMETER DESCRIPTION path path of table.yml (defaults: Config.pid_dir) TYPE: Path value Data to store. Defaults to None. TYPE: Iterable DEFAULT: None Source code in tablite/base.py def __init__(self, path, value=None) -> None:\n \"\"\"Create Column\n\n Args:\n path (Path): path of table.yml (defaults: Config.pid_dir)\n value (Iterable, optional): Data to store. 
Defaults to None.\n \"\"\"\n self.path = path\n self.pages = [] # keeps pointers to instances of Page\n if value is not None:\n self.extend(value)\n "},{"location":"reference/base/#tablite.base.Column-attributes","title":"Attributes","text":""},{"location":"reference/base/#tablite.base.Column.path","title":"tablite.base.Column.path = path instance-attribute ","text":""},{"location":"reference/base/#tablite.base.Column.pages","title":"tablite.base.Column.pages = [] instance-attribute ","text":""},{"location":"reference/base/#tablite.base.Column-functions","title":"Functions","text":""},{"location":"reference/base/#tablite.base.Column.__len__","title":"tablite.base.Column.__len__() ","text":"Source code in tablite/base.py def __len__(self):\n return sum(len(p) for p in self.pages)\n "},{"location":"reference/base/#tablite.base.Column.__repr__","title":"tablite.base.Column.__repr__() ","text":"Source code in tablite/base.py def __repr__(self):\n return f\"{self.__class__.__name__}({self.path}, {self[:]})\"\n "},{"location":"reference/base/#tablite.base.Column.repaginate","title":"tablite.base.Column.repaginate() ","text":"resizes pages to Config.PAGE_SIZE Source code in tablite/base.py def repaginate(self):\n \"\"\"resizes pages to Config.PAGE_SIZE\"\"\"\n from tablite.nimlite import repaginate as _repaginate\n\n _repaginate(self)\n "},{"location":"reference/base/#tablite.base.Column.extend","title":"tablite.base.Column.extend(value) ","text":"extends the column. PARAMETER DESCRIPTION value data TYPE: ndarray Source code in tablite/base.py def extend(self, value): # USER FUNCTION.\n \"\"\"extends the column.\n\n Args:\n value (np.ndarray): data\n \"\"\"\n if isinstance(value, Column):\n self.pages.extend(value.pages[:])\n return\n elif isinstance(value, np.ndarray):\n pass\n elif isinstance(value, (list, tuple)):\n value = list_to_np_array(value)\n else:\n raise TypeError(f\"Cannot extend Column with {type(value)}\")\n type_check(value, np.ndarray)\n for array in self._paginate(value):\n self.pages.append(Page(path=self.path, array=array))\n "},{"location":"reference/base/#tablite.base.Column.clear","title":"tablite.base.Column.clear() ","text":"clears the column. Like list().clear() Source code in tablite/base.py def clear(self):\n \"\"\"\n clears the column. Like list().clear()\n \"\"\"\n self.pages.clear()\n "},{"location":"reference/base/#tablite.base.Column.getpages","title":"tablite.base.Column.getpages(item) ","text":"public non-user function to identify any pages + slices of data to be retrieved given a slice (item) PARAMETER DESCRIPTION item target slice of data TYPE: (int, slice) RETURNS DESCRIPTION list of pages/np.ndarrays. Example: [Page(1), Page(2), np.ndarray([4,5,6], int64)] This helps, for example when creating a copy, as the copy can reference the pages 1 and 2 and only need to store the np.ndarray that is unique to it. 
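To make the page-sharing behaviour described above concrete, here is a minimal sketch. The scratch directory and its pages/ subfolder are assumptions for illustration only; in normal use tablite manages the working directory itself (see Config.workdir and Config.pid). ```
# Minimal sketch of page sharing between a Column and its copy.
# Assumption: Column may write pages under `workdir` -- normally
# tablite manages this directory itself (Config.workdir / Config.pid).
from pathlib import Path
from tablite.base import Column

workdir = Path("/tmp/tablite-demo")            # hypothetical scratch directory
(workdir / "pages").mkdir(parents=True, exist_ok=True)

col = Column(workdir, value=[1, 2, 3, 4, 5])
cp = col.copy()                 # the copy references the same immutable pages,
assert cp.pages == col.pages    # so no data is duplicated on disk.
``` Because pages are immutable, sharing them between columns is safe and makes copies cheap.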
Source code in tablite/base.py def getpages(self, item):\n \"\"\"public non-user function to identify any pages + slices\n of data to be retrieved given a slice (item)\n\n Args:\n item (int,slice): target slice of data\n\n Returns:\n list of pages/np.ndarrays.\n\n Example: [Page(1), Page(2), np.ndarray([4,5,6], int64)]\n This helps, for example when creating a copy, as the copy\n can reference the pages 1 and 2 and only need to store\n the np.ndarray that is unique to it.\n \"\"\"\n # internal function\n if isinstance(item, int):\n if item < 0:\n item = len(self) + item\n item = slice(item, item + 1, 1)\n\n type_check(item, slice)\n is_reversed = False if (item.step is None or item.step > 0) else True\n\n length = len(self)\n scan_item = slice(*item.indices(length))\n range_item = range(*item.indices(length))\n\n pages = []\n start, end = 0, 0\n for page in self.pages:\n start, end = end, end + page.len\n if is_reversed:\n if start > scan_item.start:\n break\n if end < scan_item.stop:\n continue\n else:\n if start > scan_item.stop:\n break\n if end < scan_item.start:\n continue\n ro = intercept(range(start, end), range_item)\n if len(ro) == 0:\n continue\n elif len(ro) == page.len: # share the whole immutable page\n pages.append(page)\n else: # fetch the slice and filter it.\n search_slice = slice(ro.start - start, ro.stop - start, ro.step)\n np_arr = load_numpy(page.path)\n match = np_arr[search_slice]\n pages.append(match)\n\n if is_reversed:\n pages.reverse()\n for ix, page in enumerate(pages):\n if isinstance(page, SimplePage):\n data = page.get()\n pages[ix] = np.flip(data)\n else:\n pages[ix] = np.flip(page)\n\n return pages\n "},{"location":"reference/base/#tablite.base.Column.iter_by_page","title":"tablite.base.Column.iter_by_page() ","text":"iterates over the column, page by page. This method minimizes the number of reads. RETURNS DESCRIPTION generator of tuple: start: int end: int data: np.ndarray Source code in tablite/base.py def iter_by_page(self):\n \"\"\"iterates over the column, page by page.\n This method minimizes the number of reads.\n\n Returns:\n generator of tuple:\n start: int\n end: int\n data: np.ndarray\n \"\"\"\n start, end = 0, 0\n for page in self.pages:\n start, end = end, end + page.len\n yield start, end, page\n "},{"location":"reference/base/#tablite.base.Column.__getitem__","title":"tablite.base.Column.__getitem__(item) ","text":"gets numpy array. PARAMETER DESCRIPTION item slice of column TYPE: int OR slice RETURNS DESCRIPTION np.ndarray: results as numpy array. Remember: >>> R = np.array([0,1,2,3,4,5])\n>>> R[3]\n3\n>>> R[3:4]\narray([3])\n Source code in tablite/base.py def __getitem__(self, item): # USER FUNCTION.\n \"\"\"gets numpy array.\n\n Args:\n item (int OR slice): slice of column\n\n Returns:\n np.ndarray: results as numpy array.\n\n Remember:\n ```\n >>> R = np.array([0,1,2,3,4,5])\n >>> R[3]\n 3\n >>> R[3:4]\n array([3])\n ```\n \"\"\"\n result = []\n for element in self.getpages(item):\n if isinstance(element, SimplePage):\n result.append(element.get())\n else:\n result.append(element)\n\n if result:\n arr = np_type_unify(result)\n else:\n arr = np.array([])\n\n if isinstance(item, int):\n if len(arr) == 0:\n raise IndexError(\n f\"index {item} is out of bounds for axis 0 with size {len(self)}\"\n )\n return numpy_to_python(arr[0])\n else:\n return arr\n "},{"location":"reference/base/#tablite.base.Column.__setitem__","title":"tablite.base.Column.__setitem__(key, value) ","text":"sets values. 
PARAMETER DESCRIPTION key selector TYPE: (int, slice) value values to insert TYPE: any RAISES DESCRIPTION KeyError Following normal slicing rules Source code in tablite/base.py def __setitem__(self, key, value): # USER FUNCTION.\n \"\"\"sets values.\n\n Args:\n key (int,slice): selector\n value (any): values to insert\n\n Raises:\n KeyError: Following normal slicing rules\n \"\"\"\n if isinstance(key, int):\n self._setitem_integer_key(key, value)\n\n elif isinstance(key, slice):\n if not isinstance(value, np.ndarray):\n value = list_to_np_array(value)\n type_check(value, np.ndarray)\n\n if key.start is None and key.stop is None and key.step in (None, 1):\n self._setitem_replace_all(key, value)\n elif key.start is not None and key.stop is None and key.step in (None, 1):\n self._setitem_extend(key, value)\n elif key.stop is not None and key.start is None and key.step in (None, 1):\n self._setitem_prextend(key, value)\n elif (\n key.step in (None, 1) and key.start is not None and key.stop is not None\n ):\n self._setitem_insert(key, value)\n elif key.step not in (None, 1):\n self._setitem_update(key, value)\n else:\n raise KeyError(f\"bad key: {key}\")\n else:\n raise KeyError(f\"bad key: {key}\")\n "},{"location":"reference/base/#tablite.base.Column.__delitem__","title":"tablite.base.Column.__delitem__(key) ","text":"deletes items selected by key PARAMETER DESCRIPTION key selector TYPE: (int, slice) RAISES DESCRIPTION KeyError following normal slicing rules. Source code in tablite/base.py def __delitem__(self, key): # USER FUNCTION\n \"\"\"deletes items selected by key\n\n Args:\n key (int,slice): selector\n\n Raises:\n KeyError: following normal slicing rules.\n \"\"\"\n if isinstance(key, int):\n self._del_by_int(key)\n elif isinstance(key, slice):\n self._del_by_slice(key)\n else:\n raise KeyError(f\"bad key: {key}\")\n "},{"location":"reference/base/#tablite.base.Column.get_by_indices","title":"tablite.base.Column.get_by_indices(indices: Union[List[int], np.ndarray]) -> np.ndarray ","text":"retrieves values from column given a set of indices. PARAMETER DESCRIPTION indices targets TYPE: array This method uses np.take, is faster than iterating over rows. 
Examples: >>> indices = np.array(list(range(3,700_700, 426)))\n>>> arr = np.array(list(range(2_000_000)))\nPythonic:\n>>> [v for i,v in enumerate(arr) if i in indices]\nNumpyionic:\n>>> np.take(arr, indices)\n Source code in tablite/base.py def get_by_indices(self, indices: Union[List[int], np.ndarray]) -> np.ndarray:\n \"\"\"retrieves values from column given a set of indices.\n\n Args:\n indices (np.array): targets\n\n This method uses np.take, is faster than iterating over rows.\n Examples:\n ```\n >>> indices = np.array(list(range(3,700_700, 426)))\n >>> arr = np.array(list(range(2_000_000)))\n Pythonic:\n >>> [v for i,v in enumerate(arr) if i in indices]\n Numpyionic:\n >>> np.take(arr, indices)\n ```\n \"\"\"\n type_check(indices, np.ndarray)\n\n dtypes = set()\n values = np.empty(\n indices.shape, dtype=object\n ) # placeholder for the indexed values.\n\n for start, end, page in self.iter_by_page():\n range_match = np.asarray(((indices >= start) & (indices < end)) | (indices == -1)).nonzero()[0]\n if len(range_match):\n # only fetch the data if there's a range match!\n data = page.get() \n sub_index = np.take(indices, range_match)\n # sub_index2 otherwise will raise index error where len(data) > (-1 - start)\n # so the clause below is required:\n if len(data) > (-1 - start):\n sub_index = np.where(sub_index == -1, -1, sub_index - start)\n arr = np.take(data, sub_index)\n dtypes.add(arr.dtype)\n np.put(values, range_match, arr)\n\n if len(dtypes) == 1: # simplify the datatype\n dtype = next(iter(dtypes))\n values = np.array(values, dtype=dtype)\n return values\n "},{"location":"reference/base/#tablite.base.Column.__iter__","title":"tablite.base.Column.__iter__() ","text":"Source code in tablite/base.py def __iter__(self): # USER FUNCTION.\n for page in self.pages:\n data = page.get()\n for value in data:\n yield value\n "},{"location":"reference/base/#tablite.base.Column.__eq__","title":"tablite.base.Column.__eq__(other) ","text":"compares two columns. Like list1 == list2 Source code in tablite/base.py def __eq__(self, other): # USER FUNCTION.\n \"\"\"\n compares two columns. Like `list1 == list2`\n \"\"\"\n if len(self) != len(other): # quick cheap check.\n return False\n\n if isinstance(other, (list, tuple)):\n return all(a == b for a, b in zip(self[:], other))\n\n elif isinstance(other, Column):\n if self.pages == other.pages: # special case.\n return True\n\n # are the pages of same size?\n if len(self.pages) == len(other.pages):\n if [p.len for p in self.pages] == [p.len for p in other.pages]:\n for a, b in zip(self.pages, other.pages):\n if not (a.get() == b.get()).all():\n return False\n return True\n # to bad. Element comparison it is then:\n for a, b in zip(iter(self), iter(other)):\n if a != b:\n return False\n return True\n\n elif isinstance(other, np.ndarray):\n start, end = 0, 0\n for p in self.pages:\n start, end = end, end + p.len\n if not (p.get() == other[start:end]).all():\n return False\n return True\n else:\n raise TypeError(f\"Cannot compare {self.__class__} with {type(other)}\")\n "},{"location":"reference/base/#tablite.base.Column.__ne__","title":"tablite.base.Column.__ne__(other) ","text":"compares two columns. Like list1 != list2 Source code in tablite/base.py def __ne__(self, other): # USER FUNCTION\n \"\"\"\n compares two columns. 
Like `list1 != list2`\n \"\"\"\n if len(self) != len(other): # quick cheap check.\n return True\n\n if isinstance(other, (list, tuple)):\n return any(a != b for a, b in zip(self[:], other))\n\n elif isinstance(other, Column):\n if self.pages == other.pages: # special case.\n return False\n\n # are the pages of same size?\n if len(self.pages) == len(other.pages):\n if [p.len for p in self.pages] == [p.len for p in other.pages]:\n for a, b in zip(self.pages, other.pages):\n if not (a.get() == b.get()).all():\n return True\n return False\n # to bad. Element comparison it is then:\n for a, b in zip(iter(self), iter(other)):\n if a != b:\n return True\n return False\n\n elif isinstance(other, np.ndarray):\n start, end = 0, 0\n for p in self.pages:\n start, end = end, end + p.len\n if (p.get() != other[start:end]).any():\n return True\n return False\n else:\n raise TypeError(f\"Cannot compare {self.__class__} with {type(other)}\")\n "},{"location":"reference/base/#tablite.base.Column.copy","title":"tablite.base.Column.copy() ","text":"returns deep=copy of Column RETURNS DESCRIPTION Column Source code in tablite/base.py def copy(self):\n \"\"\"returns deep=copy of Column\n\n Returns:\n Column\n \"\"\"\n cp = Column(path=self.path)\n cp.pages = self.pages[:]\n return cp\n "},{"location":"reference/base/#tablite.base.Column.__copy__","title":"tablite.base.Column.__copy__() ","text":"see copy Source code in tablite/base.py def __copy__(self):\n \"\"\"see copy\"\"\"\n return self.copy()\n "},{"location":"reference/base/#tablite.base.Column.__imul__","title":"tablite.base.Column.__imul__(other) ","text":"Repeats instance of column N times. Like list() * N Example: >>> one = Column(data=[1,2])\n>>> one *= 5\n>>> one\n[1,2, 1,2, 1,2, 1,2, 1,2]\n Source code in tablite/base.py def __imul__(self, other):\n \"\"\"\n Repeats instance of column N times. Like list() * N\n\n Example:\n ```\n >>> one = Column(data=[1,2])\n >>> one *= 5\n >>> one\n [1,2, 1,2, 1,2, 1,2, 1,2]\n ```\n \"\"\"\n if not (isinstance(other, int) and other > 0):\n raise TypeError(\n f\"a column can be repeated an integer number of times, not {type(other)} number of times\"\n )\n self.pages = self.pages[:] * other\n return self\n "},{"location":"reference/base/#tablite.base.Column.__mul__","title":"tablite.base.Column.__mul__(other) ","text":"Repeats instance of column N times. Like list() * N Example: >>> one = Column(data=[1,2])\n>>> two = one * 5\n>>> two\n[1,2, 1,2, 1,2, 1,2, 1,2]\n Source code in tablite/base.py def __mul__(self, other):\n \"\"\"\n Repeats instance of column N times. Like list() * N\n\n Example:\n ```\n >>> one = Column(data=[1,2])\n >>> two = one * 5\n >>> two\n [1,2, 1,2, 1,2, 1,2, 1,2]\n ```\n \"\"\"\n if not isinstance(other, int):\n raise TypeError(\n f\"a column can be repeated an integer number of times, not {type(other)} number of times\"\n )\n cp = self.copy()\n cp *= other\n return cp\n "},{"location":"reference/base/#tablite.base.Column.__iadd__","title":"tablite.base.Column.__iadd__(other) ","text":"Source code in tablite/base.py def __iadd__(self, other):\n if isinstance(other, (list, tuple)):\n other = list_to_np_array(other)\n self.extend(other)\n elif isinstance(other, Column):\n self.pages.extend(other.pages[:])\n else:\n raise TypeError(f\"{type(other)} not supported.\")\n return self\n "},{"location":"reference/base/#tablite.base.Column.__contains__","title":"tablite.base.Column.__contains__(item) ","text":"determines if item is in the Column. 
Similar to 'x' in ['a','b','c'] returns boolean PARAMETER DESCRIPTION item value to search for TYPE: any RETURNS DESCRIPTION bool True if item exists in column. Source code in tablite/base.py def __contains__(self, item):\n \"\"\"determines if item is in the Column.\n Similar to `'x' in ['a','b','c']`\n returns boolean\n\n Args:\n item (any): value to search for\n\n Returns:\n bool: True if item exists in column.\n \"\"\"\n for page in set(self.pages):\n if item in page.get(): # x in np.ndarray([...]) uses np.any(arr, value)\n return True\n return False\n "},{"location":"reference/base/#tablite.base.Column.remove_all","title":"tablite.base.Column.remove_all(*values) ","text":"removes all values of values Source code in tablite/base.py def remove_all(self, *values):\n \"\"\"\n removes all values of `values`\n \"\"\"\n type_check(values, tuple)\n if isinstance(values[0], tuple):\n values = values[0]\n to_remove = list_to_np_array(values)\n for index, page in enumerate(self.pages):\n data = page.get()\n bitmask = np.isin(data, to_remove) # identify elements to remove.\n if bitmask.any():\n bitmask = np.invert(bitmask) # turn bitmask around to keep.\n new_data = np.compress(bitmask, data)\n new_page = Page(self.path, new_data)\n self.pages[index] = new_page\n "},{"location":"reference/base/#tablite.base.Column.replace","title":"tablite.base.Column.replace(mapping) ","text":"replaces values using a mapping. PARAMETER DESCRIPTION mapping {value to replace: new value, ...} TYPE: dict Example: >>> t = Table(columns={'A': [1,2,3,4]})\n>>> t['A'].replace({2:20,4:40})\n>>> t[:]\nnp.ndarray([1,20,3,40])\n Source code in tablite/base.py def replace(self, mapping):\n \"\"\"\n replaces values using a mapping.\n\n Args:\n mapping (dict): {value to replace: new value, ...}\n\n Example:\n ```\n >>> t = Table(columns={'A': [1,2,3,4]})\n >>> t['A'].replace({2:20,4:40})\n >>> t[:]\n np.ndarray([1,20,3,40])\n ```\n \"\"\"\n type_check(mapping, dict)\n to_replace = np.array(list(mapping.keys()))\n for index, page in enumerate(self.pages):\n data = page.get()\n bitmask = np.isin(data, to_replace) # identify elements to replace.\n if bitmask.any():\n warray = np.compress(bitmask, data)\n py_dtype = page.dtype\n for ix, v in enumerate(warray):\n old_py_val = numpy_to_python(v)\n new_py_val = mapping[old_py_val]\n old_dt = type(old_py_val)\n new_dt = type(new_py_val)\n\n warray[ix] = new_py_val\n\n py_dtype[new_dt] = py_dtype.get(new_dt, 0) + 1\n py_dtype[old_dt] = py_dtype.get(old_dt, 0) - 1\n\n if py_dtype[old_dt] <= 0:\n del py_dtype[old_dt]\n\n data[bitmask] = warray\n self.pages[index] = Page(path=self.path, array=data)\n "},{"location":"reference/base/#tablite.base.Column.types","title":"tablite.base.Column.types() ","text":"returns dict with python datatypes RETURNS DESCRIPTION dict frequency of occurrence of python datatypes Source code in tablite/base.py def types(self):\n \"\"\"\n returns dict with python datatypes\n\n Returns:\n dict: frequency of occurrence of python datatypes\n \"\"\"\n d = Counter()\n for page in self.pages:\n assert isinstance(page.dtype, dict)\n d += page.dtype\n return dict(d)\n "},{"location":"reference/base/#tablite.base.Column.index","title":"tablite.base.Column.index() ","text":"returns dict with { unique entry : list of indices } example: >>> c = Column(data=['a','b','a','c','b'])\n>>> c.index()\n{'a':[0,2], 'b': [1,4], 'c': [3]}\n Source code in tablite/base.py def index(self):\n \"\"\"\n returns dict with { unique entry : list of indices }\n\n example:\n ```\n >>> c = 
Column(data=['a','b','a','c','b'])\n >>> c.index()\n {'a':[0,2], 'b': [1,4], 'c': [3]}\n ```\n \"\"\"\n d = defaultdict(list)\n for ix, v in enumerate(self.__iter__()):\n d[v].append(ix)\n return dict(d)\n "},{"location":"reference/base/#tablite.base.Column.unique","title":"tablite.base.Column.unique() ","text":"returns unique list of values. example: >>> c = Column(data=['a','b','a','c','b'])\n>>> c.unique()\n['a','b','c']\n Source code in tablite/base.py def unique(self):\n \"\"\"\n returns unique list of values.\n\n example:\n ```\n >>> c = Column(data=['a','b','a','c','b'])\n >>> c.unique()\n ['a','b','c']\n ```\n \"\"\"\n arrays = []\n for page in set(self.pages):\n try: # when it works, numpy is fast...\n arrays.append(np.unique(page.get()))\n except TypeError: # ...but np.unique cannot handle Nones.\n arrays.append(multitype_set(page.get()))\n union = np_type_unify(arrays)\n try:\n return np.unique(union)\n except MemoryError:\n return np.array(set(union))\n except TypeError:\n return multitype_set(union)\n "},{"location":"reference/base/#tablite.base.Column.histogram","title":"tablite.base.Column.histogram() ","text":"returns 2 arrays: unique elements and count of each element example: >>> c = Column(data=['a','b','a','c','b'])\n>>> c.histogram()\n{'a':2,'b':2,'c':1}\n Source code in tablite/base.py def histogram(self):\n \"\"\"\n returns 2 arrays: unique elements and count of each element\n\n example:\n ```\n >>> c = Column(data=['a','b','a','c','b'])\n >>> c.histogram()\n {'a':2,'b':2,'c':1}\n ```\n \"\"\"\n d = defaultdict(int)\n for page in self.pages:\n try:\n uarray, carray = np.unique(page.get(), return_counts=True)\n except TypeError:\n uarray = page.get()\n carray = repeat(1, len(uarray))\n\n for i, c in zip(uarray, carray):\n v = numpy_to_python(i)\n d[(type(v), v)] += numpy_to_python(c)\n u = [v for _, v in d.keys()]\n c = list(d.values())\n return u, c # unique, counts\n "},{"location":"reference/base/#tablite.base.Column.statistics","title":"tablite.base.Column.statistics() ","text":"provides summary statistics. RETURNS DESCRIPTION dict returns dict with: - min (int/float, length of str, date)
- max (int/float, length of str, date)
- mean (int/float, length of str, date)
- median (int/float, length of str, date)
- stdev (int/float, length of str, date)
- mode (int/float, length of str, date)
- distinct (int/float, length of str, date)
- iqr (int/float, length of str, date)
- sum (int/float, length of str, date)
- histogram (see .histogram)
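A hedged usage sketch of the above; the exact dictionary keys are assumed to match the list just given: ```
# Hedged sketch: summary statistics for a numeric column.
from tablite import Table

t = Table(columns={"n": [1, 2, 2, 3, 4]})
stats = t["n"].statistics()                       # dict with the keys listed above
print(stats["min"], stats["max"], stats["mean"])  # e.g. 1 4 2.4 (values illustrative)
```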
Source code in tablite/base.py def statistics(self):\n \"\"\"provides summary statistics.\n\n Returns:\n dict: returns dict with:\n - min (int/float, length of str, date)\n - max (int/float, length of str, date)\n - mean (int/float, length of str, date)\n - median (int/float, length of str, date)\n - stdev (int/float, length of str, date)\n - mode (int/float, length of str, date)\n - distinct (int/float, length of str, date)\n - iqr (int/float, length of str, date)\n - sum (int/float, length of str, date)\n - histogram (see .histogram)\n \"\"\"\n values, counts = self.histogram()\n return summary_statistics(values, counts)\n "},{"location":"reference/base/#tablite.base.Column.count","title":"tablite.base.Column.count(item) ","text":"counts appearances of item in column. Note that in python, True == 1 and False == 0 , whereby the following difference occurs: in python: >>> L = [1, True]\n>>> L.count(True)\n2\n in tablite: >>> t = Table({'L': [1,True]})\n>>> t['L'].count(True)\n1\n PARAMETER DESCRIPTION item target item TYPE: Any RETURNS DESCRIPTION int number of occurrences of item. Source code in tablite/base.py def count(self, item):\n \"\"\"counts appearances of item in column.\n\n Note that in python, `True == 1` and `False == 0`,\n whereby the following difference occurs:\n\n in python:\n ```\n >>> L = [1, True]\n >>> L.count(True)\n 2\n ```\n in tablite:\n ```\n >>> t = Table({'L': [1,True]})\n >>> t['L'].count(True)\n 1\n ```\n\n Args:\n item (Any): target item\n\n Returns:\n int: number of occurrences of item.\n \"\"\"\n result = 0\n for page in self.pages:\n data = page.get()\n if data.dtype != \"O\":\n result += np.nonzero(page.get() == item)[0].shape[0]\n # what happens here ---^ below:\n # arr = page.get()\n # >>> arr\n # array([1,2,3,4,3], int64)\n # >>> (arr == 3)\n # array([False, False, True, False, True])\n # >>> np.nonzero(arr==3)\n # (array([2,4], dtype=int64), ) <-- tuple!\n # >>> np.nonzero(page.get() == item)[0]\n # array([2,4])\n # >>> np.nonzero(page.get() == item)[0].shape\n # (2, )\n # >>> np.nonzero(page.get() == item)[0].shape[0]\n # 2\n else:\n result += sum(1 for i in data if type(i) == type(item) and i == item)\n return result\n "},{"location":"reference/base/#tablite.base.BaseTable","title":"tablite.base.BaseTable(columns: [dict, None] = None, headers: [list, None] = None, rows: [list, None] = None, _path: [Path, None] = None) ","text":" Bases: object creates Table PARAMETER DESCRIPTION EITHER columns (dict, optional): dict with column names as keys, values as lists. Example: t = Table(columns={\"a\": [1, 2], \"b\": [3, 4]}) _path path to main process working directory. 
TYPE: Path DEFAULT: None Source code in tablite/base.py def __init__(\n self,\n columns: [dict, None] = None,\n headers: [list, None] = None,\n rows: [list, None] = None,\n _path: [Path, None] = None,\n) -> None:\n \"\"\"creates Table\n\n Args:\n EITHER:\n columns (dict, optional): dict with column names as keys, values as lists.\n Example: t = Table(columns={\"a\": [1, 2], \"b\": [3, 4]})\n OR\n headers (list of strings, optional): list of column names.\n rows (list of tuples or lists, optional): values for columns\n Example: t = Table(headers=[\"a\", \"b\"], rows=[[1,3], [2,4]])\n\n _path (pathlib.Path, optional): path to main process working directory.\n \"\"\"\n if _path is None:\n if self._pid_dir is None:\n self._pid_dir = Path(Config.workdir) / Config.pid\n if not self._pid_dir.exists():\n self._pid_dir.mkdir()\n (self._pid_dir / \"pages\").mkdir()\n register(self._pid_dir)\n\n _path = Path(self._pid_dir)\n # if path exists under the given PID it will be overwritten.\n # this can only happen if the process previously was SIGKILLed.\n type_check(_path, Path)\n self.path = _path # filename used during multiprocessing.\n self.columns = {} # maps colunn names to instances of Column.\n\n # user friendly features.\n if columns and any((headers, rows)):\n raise ValueError(\"Either columns as dict OR headers and rows. Not both.\")\n\n if headers and rows:\n rotated = list(zip(*rows))\n columns = {k: v for k, v in zip(headers, rotated)}\n\n if columns:\n type_check(columns, dict)\n for k, v in columns.items():\n self.__setitem__(k, v)\n "},{"location":"reference/base/#tablite.base.BaseTable-attributes","title":"Attributes","text":""},{"location":"reference/base/#tablite.base.BaseTable.path","title":"tablite.base.BaseTable.path = _path instance-attribute ","text":""},{"location":"reference/base/#tablite.base.BaseTable.columns","title":"tablite.base.BaseTable.columns = {} instance-attribute ","text":""},{"location":"reference/base/#tablite.base.BaseTable.rows","title":"tablite.base.BaseTable.rows property ","text":"enables row based iteration in python types. Example: for row in Table.rows:\n print(row)\n Yields: tuple: values is same order as columns. 
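A short sketch of row-based iteration; each row is yielded as a tuple with values in the same order as the columns: ```
# Rows are yielded one tuple at a time, in column order.
from tablite import Table

t = Table(columns={"a": [1, 2], "b": [3, 4]})
for row in t.rows:
    print(row)   # (1, 3) then (2, 4)
```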
"},{"location":"reference/base/#tablite.base.BaseTable-functions","title":"Functions","text":""},{"location":"reference/base/#tablite.base.BaseTable.__str__","title":"tablite.base.BaseTable.__str__() ","text":"Source code in tablite/base.py def __str__(self): # USER FUNCTION.\n return f\"{self.__class__.__name__}({len(self.columns):,} columns, {len(self):,} rows)\"\n "},{"location":"reference/base/#tablite.base.BaseTable.__repr__","title":"tablite.base.BaseTable.__repr__() ","text":"Source code in tablite/base.py def __repr__(self):\n return self.__str__()\n "},{"location":"reference/base/#tablite.base.BaseTable.nbytes","title":"tablite.base.BaseTable.nbytes() ","text":"finds the total bytes of the table on disk RETURNS DESCRIPTION tuple int: real bytes used on disk int: total bytes used if flattened Source code in tablite/base.py def nbytes(self): # USER FUNCTION.\n \"\"\"finds the total bytes of the table on disk\n\n Returns:\n tuple:\n int: real bytes used on disk\n int: total bytes used if flattened\n \"\"\"\n real = {}\n total = 0\n for column in self.columns.values():\n for page in set(column.pages):\n real[page] = page.path.stat().st_size\n for page in column.pages:\n total += real[page]\n return sum(real.values()), total\n "},{"location":"reference/base/#tablite.base.BaseTable.items","title":"tablite.base.BaseTable.items() ","text":"returns table as dict RETURNS DESCRIPTION dict Table as dict {column_name: [values], ...} Source code in tablite/base.py def items(self): # USER FUNCTION.\n \"\"\"returns table as dict\n\n Returns:\n dict: Table as dict `{column_name: [values], ...}`\n \"\"\"\n return {\n name: column[:].tolist() for name, column in self.columns.items()\n }.items()\n "},{"location":"reference/base/#tablite.base.BaseTable.__delitem__","title":"tablite.base.BaseTable.__delitem__(key) ","text":"Examples: >>> del table['a'] # removes column 'a'\n>>> del table[-3:] # removes last 3 rows from all columns.\n Source code in tablite/base.py def __delitem__(self, key): # USER FUNCTION.\n \"\"\"\n Examples:\n ```\n >>> del table['a'] # removes column 'a'\n >>> del table[-3:] # removes last 3 rows from all columns.\n ```\n \"\"\"\n if isinstance(key, (int, slice)):\n for column in self.columns.values():\n del column[key]\n elif key in self.columns:\n del self.columns[key]\n else:\n raise KeyError(f\"Key not found: {key}\")\n "},{"location":"reference/base/#tablite.base.BaseTable.__setitem__","title":"tablite.base.BaseTable.__setitem__(key, value) ","text":"table behaves like a dict. Args: key (str or hashable): column name value (iterable): list, tuple or nd.array with values. As Table now accepts the keyword columns as a dict: >>> t = Table(columns={'b':[4,5,6], 'c':[7,8,9]})\n and the header/data combinations: >>> t = Table(header=['b','c'], data=[[4,5,6],[7,8,9]])\n This has the side-benefit that tuples now can be used as headers. 
Source code in tablite/base.py def __setitem__(self, key, value): # USER FUNCTION\n \"\"\"table behaves like a dict.\n Args:\n key (str or hashable): column name\n value (iterable): list, tuple or nd.array with values.\n\n As Table now accepts the keyword `columns` as a dict:\n ```\n >>> t = Table(columns={'b':[4,5,6], 'c':[7,8,9]})\n ```\n and the header/data combinations:\n ```\n >>> t = Table(header=['b','c'], data=[[4,5,6],[7,8,9]])\n ```\n This has the side-benefit that tuples now can be used as headers.\n \"\"\"\n if value is None:\n self.columns[key] = Column(self.path, value=None)\n elif isinstance(value, (list, tuple)):\n value = list_to_np_array(value)\n self.columns[key] = Column(self.path, value)\n elif isinstance(value, (np.ndarray)):\n self.columns[key] = Column(self.path, value)\n elif isinstance(value, Column):\n self.columns[key] = value\n else:\n raise TypeError(f\"{type(value)} not supported.\")\n "},{"location":"reference/base/#tablite.base.BaseTable.__getitem__","title":"tablite.base.BaseTable.__getitem__(keys) ","text":"Enables selection of columns and rows PARAMETER DESCRIPTION keys TYPE: column name, integer or slice Examples >>> 10] selects first 10 rows from all columns TYPE: table[ >>> 20:3] selects column 'b' and 'c' and 'a' twice for a slice. TYPE: table['b', 'a', 'a', 'c', 2 Raises: KeyError: if key is not found. TypeError: if key is not a string, integer or slice. RETURNS DESCRIPTION Table returns columns in same order as selection. Source code in tablite/base.py def __getitem__(self, keys): # USER FUNCTION\n \"\"\"\n Enables selection of columns and rows\n\n Args:\n keys (column name, integer or slice):\n Examples:\n ```\n >>> table['a'] selects column 'a'\n >>> table[3] selects row 3 as a tuple.\n >>> table[:10] selects first 10 rows from all columns\n >>> table['a','b', slice(3,20,2)] selects a slice from columns 'a' and 'b'\n >>> table['b', 'a', 'a', 'c', 2:20:3] selects column 'b' and 'c' and 'a' twice for a slice.\n >>> table[('b', 'a', 'a', 'c')] selects columns 'b', 'a', 'a', and 'c' using a tuple.\n ```\n Raises:\n KeyError: if key is not found.\n TypeError: if key is not a string, integer or slice.\n\n Returns:\n Table: returns columns in same order as selection.\n \"\"\"\n\n if not isinstance(keys, tuple):\n if isinstance(keys, list):\n keys = tuple(keys)\n else:\n keys = (keys,)\n if isinstance(keys[0], tuple):\n keys = tuple(list(chain(*keys)))\n\n integers = [i for i in keys if isinstance(i, int)]\n if len(integers) == len(keys) == 1: # return a single tuple.\n keys = [slice(keys[0])]\n\n column_names = [i for i in keys if isinstance(i, str)]\n column_names = list(self.columns) if not column_names else column_names\n not_found = [name for name in column_names if name not in self.columns]\n if not_found:\n raise KeyError(f\"keys not found: {', '.join(not_found)}\")\n\n slices = [i for i in keys if isinstance(i, slice)]\n slc = slice(0, len(self)) if not slices else slices[0]\n\n if (\n len(slices) == 0 and len(column_names) == 1\n ): # e.g. tbl['a'] or tbl['a'][:10]\n col = self.columns[column_names[0]]\n if slices:\n return col[slc] # return slice from column as list of values\n else:\n return col # return whole column\n\n elif len(integers) == 1: # return a single tuple.\n row_no = integers[0]\n slc = slice(row_no, row_no + 1)\n return tuple(self.columns[name][slc].tolist()[0] for name in column_names)\n\n elif not slices: # e.g. 
new table with N whole columns.\n return self.__class__(\n columns={name: self.columns[name] for name in column_names}\n )\n\n else: # e.g. new table from selection of columns and slices.\n t = self.__class__()\n for name in column_names:\n column = self.columns[name]\n\n new_column = Column(t.path) # create new Column.\n for item in column.getpages(slc):\n if isinstance(item, np.ndarray):\n new_column.extend(item) # extend subslice (expensive)\n elif isinstance(item, SimplePage):\n new_column.pages.append(item) # extend page (cheap)\n else:\n raise TypeError(f\"Bad item: {item}\")\n\n # below:\n # set the new column directly on t.columns.\n # Do not use t[name] as that triggers __setitem__ again.\n t.columns[name] = new_column\n\n return t\n "},{"location":"reference/base/#tablite.base.BaseTable.__len__","title":"tablite.base.BaseTable.__len__() ","text":"Source code in tablite/base.py def __len__(self): # USER FUNCTION.\n if not self.columns:\n return 0\n return max(len(c) for c in self.columns.values())\n "},{"location":"reference/base/#tablite.base.BaseTable.__eq__","title":"tablite.base.BaseTable.__eq__(other) -> bool ","text":"Determines if two tables have identical content. PARAMETER DESCRIPTION other table for comparison TYPE: Table RETURNS DESCRIPTION bool True if tables are identical. TYPE: bool Source code in tablite/base.py def __eq__(self, other) -> bool: # USER FUNCTION.\n \"\"\"Determines if two tables have identical content.\n\n Args:\n other (Table): table for comparison\n\n Returns:\n bool: True if tables are identical.\n \"\"\"\n if isinstance(other, dict):\n return self.items() == other.items()\n if not isinstance(other, BaseTable):\n return False\n if id(self) == id(other):\n return True\n if len(self) != len(other):\n return False\n if len(self) == len(other) == 0:\n return True\n if self.columns.keys() != other.columns.keys():\n return False\n for name, col in self.columns.items():\n if not (col == other.columns[name]):\n return False\n return True\n "},{"location":"reference/base/#tablite.base.BaseTable.clear","title":"tablite.base.BaseTable.clear() ","text":"clears the table. Like dict().clear() Source code in tablite/base.py def clear(self): # USER FUNCTION.\n \"\"\"clears the table. Like dict().clear()\"\"\"\n self.columns.clear()\n "},{"location":"reference/base/#tablite.base.BaseTable.save","title":"tablite.base.BaseTable.save(path, compression_method=zipfile.ZIP_DEFLATED, compression_level=1) ","text":"saves table to compressed tpz file. PARAMETER DESCRIPTION path file destination. TYPE: Path compression_method See zipfile compression methods. Defaults to ZIP_DEFLATED. DEFAULT: ZIP_DEFLATED compression_level See zipfile compression levels. Defaults to 1. DEFAULT: 1 The file format is as follows: .tpz is a gzip archive with table metadata captured as table.yml and the necessary set of pages saved as .npy files. 
The zip contains table.yml which provides an overview of the data: --------------------------------------\n%YAML 1.2 yaml version\ncolumns: start of columns section.\n name: \u201c\u5217 1\u201d name of column 1.\n pages: [p1b1, p1b2] list of pages in column 1.\n name: \u201c\u5217 2\u201d name of column 2\n pages: [p2b1, p2b2] list of pages in column 2.\n----------------------------------------\n Source code in tablite/base.py def save(\n self, path, compression_method=zipfile.ZIP_DEFLATED, compression_level=1\n): # USER FUNCTION.\n \"\"\"saves table to compressed tpz file.\n\n Args:\n path (Path): file destination.\n compression_method: See zipfile compression methods. Defaults to ZIP_DEFLATED.\n compression_level: See zipfile compression levels. Defaults to 1.\n The default settings produce 80% compression at 10% slowdown.\n\n The file format is as follows:\n .tpz is a gzip archive with table metadata captured as table.yml\n and the necessary set of pages saved as .npy files.\n\n The zip contains table.yml which provides an overview of the data:\n ```\n --------------------------------------\n %YAML 1.2 yaml version\n columns: start of columns section.\n name: \u201c\u5217 1\u201d name of column 1.\n pages: [p1b1, p1b2] list of pages in column 1.\n name: \u201c\u5217 2\u201d name of column 2\n pages: [p2b1, p2b2] list of pages in column 2.\n ----------------------------------------\n ```\n \"\"\"\n if isinstance(path, str):\n path = Path(path)\n type_check(path, Path)\n if path.is_dir():\n raise TypeError(f\"filename needed: {path}\")\n if path.suffix != \".tpz\":\n path = path.parent / (path.parts[-1] + \".tpz\")\n\n # create yaml document\n _page_counter = 0\n d = {}\n cols = {}\n for name, col in self.columns.items():\n type_check(col, Column)\n cols[name] = {\"pages\": [p.path.name for p in col.pages]}\n _page_counter += len(col.pages)\n d[\"columns\"] = cols\n yml = yaml.safe_dump(\n d, sort_keys=False, allow_unicode=True, default_flow_style=None\n )\n\n _file_counter = 0\n with zipfile.ZipFile(\n path, \"w\", compression=compression_method, compresslevel=compression_level\n ) as f:\n log.debug(f\"writing .tpz to {path} with\\n{yml}\")\n f.writestr(\"table.yml\", yml)\n for name, col in self.columns.items():\n for page in set(\n col.pages\n ): # set of pages! remember t *= 1000 repeats t 1000x\n with open(page.path, \"rb\", buffering=0) as raw_io:\n f.writestr(page.path.name, raw_io.read())\n _file_counter += 1\n log.debug(f\"adding Page {page.path}\")\n\n _fields = len(self) * len(self.columns)\n _avg = _fields // _page_counter\n log.debug(\n f\"Wrote {_fields:,} on {_page_counter:,} pages in {_file_counter} files: {_avg} fields/page\"\n )\n "},{"location":"reference/base/#tablite.base.BaseTable.load","title":"tablite.base.BaseTable.load(path, tqdm=_tqdm) classmethod ","text":"loads a table from .tpz file. See also Table.save for details on the file format. PARAMETER DESCRIPTION path source file TYPE: Path RETURNS DESCRIPTION Table table in read-only mode. 
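A round-trip sketch of save and load; the file name is illustrative: ```
# Hedged sketch: persist a table as .tpz and read it back.
from tablite import Table

t = Table(columns={"a": [1, 2], "b": [3, 4]})
t.save("demo.tpz")             # zip archive holding table.yml + .npy pages
t2 = Table.load("demo.tpz")    # loaded table is read-only per the docs
assert t == t2
```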
Source code in tablite/base.py @classmethod\ndef load(cls, path, tqdm=_tqdm): # USER FUNCTION.\n \"\"\"loads a table from .tpz file.\n See also Table.save for details on the file format.\n\n Args:\n path (Path): source file\n\n Returns:\n Table: table in read-only mode.\n \"\"\"\n path = Path(path)\n log.debug(f\"loading {path}\")\n with zipfile.ZipFile(path, \"r\") as f:\n yml = f.read(\"table.yml\")\n metadata = yaml.safe_load(yml)\n t = cls()\n\n page_count = sum([len(c[\"pages\"]) for c in metadata[\"columns\"].values()])\n\n with tqdm(\n total=page_count,\n desc=f\"loading '{path.name}' file\",\n disable=Config.TQDM_DISABLE,\n ) as pbar:\n for name, d in metadata[\"columns\"].items():\n column = Column(t.path)\n for page in d[\"pages\"]:\n bytestream = io.BytesIO(f.read(page))\n data = np.load(bytestream, allow_pickle=True, fix_imports=False)\n column.extend(data)\n pbar.update(1)\n t.columns[name] = column\n update_access_time(path)\n return t\n "},{"location":"reference/base/#tablite.base.BaseTable.copy","title":"tablite.base.BaseTable.copy() ","text":"Source code in tablite/base.py def copy(self):\n cls = type(self)\n t = cls()\n for name, column in self.columns.items():\n new = Column(t.path)\n new.pages = column.pages[:]\n t.columns[name] = new\n return t\n "},{"location":"reference/base/#tablite.base.BaseTable.__imul__","title":"tablite.base.BaseTable.__imul__(other) ","text":"Repeats instance of table N times. Like list: t = t * N PARAMETER DESCRIPTION other multiplier TYPE: int Source code in tablite/base.py def __imul__(self, other):\n \"\"\"Repeats instance of table N times.\n\n Like list: `t = t * N`\n\n Args:\n other (int): multiplier\n \"\"\"\n if not (isinstance(other, int) and other > 0):\n raise TypeError(\n f\"a table can be repeated an integer number of times, not {type(other)} number of times\"\n )\n for col in self.columns.values():\n col *= other\n return self\n "},{"location":"reference/base/#tablite.base.BaseTable.__mul__","title":"tablite.base.BaseTable.__mul__(other) ","text":"Repeat table N times. Like list: new = old * N PARAMETER DESCRIPTION other multiplier TYPE: int RETURNS DESCRIPTION Table Source code in tablite/base.py def __mul__(self, other):\n \"\"\"Repeat table N times.\n Like list: `new = old * N`\n\n Args:\n other (int): multiplier\n\n Returns:\n Table\n \"\"\"\n new = self.copy()\n return new.__imul__(other)\n "},{"location":"reference/base/#tablite.base.BaseTable.__iadd__","title":"tablite.base.BaseTable.__iadd__(other) ","text":"Concatenates tables with same column names. Like list: table_1 += table_2 RAISES DESCRIPTION ValueError If column names don't match. RETURNS DESCRIPTION None self is updated. Source code in tablite/base.py def __iadd__(self, other):\n \"\"\"Concatenates tables with same column names.\n\n Like list: `table_1 += table_2`\n\n Args:\n other (Table)\n\n Raises:\n ValueError: If column names don't match.\n\n Returns:\n None: self is updated.\n \"\"\"\n type_check(other, BaseTable)\n for name in self.columns.keys():\n if name not in other.columns:\n raise ValueError(f\"{name} not in other\")\n for name in other.columns.keys():\n if name not in self.columns:\n raise ValueError(f\"{name} missing from self\")\n\n for name, column in self.columns.items():\n other_col = other.columns.get(name, None)\n column.pages.extend(other_col.pages[:])\n return self\n "},{"location":"reference/base/#tablite.base.BaseTable.__add__","title":"tablite.base.BaseTable.__add__(other) ","text":"Concatenates tables with same column names. 
Like list: table_3 = table_1 + table_2 RAISES DESCRIPTION ValueError If column names don't match. RETURNS DESCRIPTION Table Source code in tablite/base.py def __add__(self, other):\n \"\"\"Concatenates tables with same column names.\n\n Like list: `table_3 = table_1 + table_2`\n\n Args:\n other (Table)\n\n Raises:\n ValueError: If column names don't match.\n\n Returns:\n Table\n \"\"\"\n type_check(other, BaseTable)\n cp = self.copy()\n cp += other\n return cp\n "},{"location":"reference/base/#tablite.base.BaseTable.add_rows","title":"tablite.base.BaseTable.add_rows(*args, **kwargs) ","text":"its more efficient to add many rows at once. if both args and kwargs, then args are added first, followed by kwargs. supported cases: >>> t = Table()\n>>> t.add_columns('row','A','B','C')\n>>> t.add_rows(1, 1, 2, 3) # (1) individual values as args\n>>> t.add_rows([2, 1, 2, 3]) # (2) list of values as args\n>>> t.add_rows((3, 1, 2, 3)) # (3) tuple of values as args\n>>> t.add_rows(*(4, 1, 2, 3)) # (4) unpacked tuple becomes arg like (1)\n>>> t.add_rows(row=5, A=1, B=2, C=3) # (5) kwargs\n>>> t.add_rows(**{'row': 6, 'A': 1, 'B': 2, 'C': 3}) # (6) dict / json interpreted a kwargs\n>>> t.add_rows((7, 1, 2, 3), (8, 4, 5, 6)) # (7) two (or more) tuples as args\n>>> t.add_rows([9, 1, 2, 3], [10, 4, 5, 6]) # (8) two or more lists as rgs\n>>> t.add_rows(\n {'row': 11, 'A': 1, 'B': 2, 'C': 3},\n {'row': 12, 'A': 4, 'B': 5, 'C': 6}\n ) # (9) two (or more) dicts as args - roughly comma sep'd json.\n>>> t.add_rows( *[\n {'row': 13, 'A': 1, 'B': 2, 'C': 3},\n {'row': 14, 'A': 1, 'B': 2, 'C': 3}\n ]) # (10) list of dicts as args\n>>> t.add_rows(row=[15,16], A=[1,1], B=[2,2], C=[3,3]) # (11) kwargs with lists as values\n Source code in tablite/base.py def add_rows(self, *args, **kwargs):\n \"\"\"its more efficient to add many rows at once.\n\n if both args and kwargs, then args are added first, followed by kwargs.\n\n supported cases:\n ```\n >>> t = Table()\n >>> t.add_columns('row','A','B','C')\n >>> t.add_rows(1, 1, 2, 3) # (1) individual values as args\n >>> t.add_rows([2, 1, 2, 3]) # (2) list of values as args\n >>> t.add_rows((3, 1, 2, 3)) # (3) tuple of values as args\n >>> t.add_rows(*(4, 1, 2, 3)) # (4) unpacked tuple becomes arg like (1)\n >>> t.add_rows(row=5, A=1, B=2, C=3) # (5) kwargs\n >>> t.add_rows(**{'row': 6, 'A': 1, 'B': 2, 'C': 3}) # (6) dict / json interpreted a kwargs\n >>> t.add_rows((7, 1, 2, 3), (8, 4, 5, 6)) # (7) two (or more) tuples as args\n >>> t.add_rows([9, 1, 2, 3], [10, 4, 5, 6]) # (8) two or more lists as rgs\n >>> t.add_rows(\n {'row': 11, 'A': 1, 'B': 2, 'C': 3},\n {'row': 12, 'A': 4, 'B': 5, 'C': 6}\n ) # (9) two (or more) dicts as args - roughly comma sep'd json.\n >>> t.add_rows( *[\n {'row': 13, 'A': 1, 'B': 2, 'C': 3},\n {'row': 14, 'A': 1, 'B': 2, 'C': 3}\n ]) # (10) list of dicts as args\n >>> t.add_rows(row=[15,16], A=[1,1], B=[2,2], C=[3,3]) # (11) kwargs with lists as values\n ```\n\n \"\"\"\n if not BaseTable._add_row_slow_warning:\n warnings.warn(\n \"add_rows is slow. Consider using add_columns and then assigning values to the columns directly.\"\n )\n BaseTable._add_row_slow_warning = True\n\n if args:\n if not all(isinstance(i, (list, tuple, dict)) for i in args): # 1,4\n args = [args]\n\n if all(isinstance(i, (list, tuple, dict)) for i in args): # 2,3,7,8\n # 1. 
turn the data into columns:\n\n d = {n: [] for n in self.columns}\n for arg in args:\n if len(arg) != len(self.columns):\n raise ValueError(\n f\"len({arg})== {len(arg)}, but there are {len(self.columns)} columns\"\n )\n\n if isinstance(arg, dict):\n for k, v in arg.items(): # 7,8\n d[k].append(v)\n\n elif isinstance(arg, (list, tuple)): # 2,3\n for n, v in zip(self.columns, arg):\n d[n].append(v)\n\n else:\n raise TypeError(f\"{arg}?\")\n # 2. extend the columns\n for n, values in d.items():\n col = self.columns[n]\n col.extend(list_to_np_array(values))\n\n if kwargs:\n if isinstance(kwargs, dict):\n if all(isinstance(v, (list, tuple)) for v in kwargs.values()):\n for k, v in kwargs.items():\n col = self.columns[k]\n col.extend(list_to_np_array(v))\n else:\n for k, v in kwargs.items():\n col = self.columns[k]\n col.extend(np.array([v]))\n else:\n raise ValueError(f\"format not recognised: {kwargs}\")\n\n return\n "},{"location":"reference/base/#tablite.base.BaseTable.add_columns","title":"tablite.base.BaseTable.add_columns(*names) ","text":"Adds column names to table. Source code in tablite/base.py def add_columns(self, *names):\n \"\"\"Adds column names to table.\"\"\"\n for name in names:\n self.columns[name] = Column(self.path)\n "},{"location":"reference/base/#tablite.base.BaseTable.add_column","title":"tablite.base.BaseTable.add_column(name, data=None) ","text":"verbose alias for table[name] = data, that checks if name already exists PARAMETER DESCRIPTION name column name TYPE: str data values. Defaults to None. TYPE: list,tuple) DEFAULT: None RAISES DESCRIPTION TypeError name isn't string ValueError name already exists Source code in tablite/base.py def add_column(self, name, data=None):\n \"\"\"verbose alias for table[name] = data, that checks if name already exists\n\n Args:\n name (str): column name\n data ((list,tuple), optional): values. Defaults to None.\n\n Raises:\n TypeError: name isn't string\n ValueError: name already exists\n \"\"\"\n if not isinstance(name, str):\n raise TypeError(\"expected name as string\")\n if name in self.columns:\n raise ValueError(f\"{name} already in {self.columns}\")\n self.__setitem__(name, data)\n "},{"location":"reference/base/#tablite.base.BaseTable.stack","title":"tablite.base.BaseTable.stack(other) ","text":"returns the joint stack of tables with overlapping column names. Example: | Table A| + | Table B| = | Table AB |\n| A| B| C| | A| B| D| | A| B| C| -|\n | A| B| -| D|\n Source code in tablite/base.py def stack(self, other):\n \"\"\"\n returns the joint stack of tables with overlapping column names.\n Example:\n ```\n | Table A| + | Table B| = | Table AB |\n | A| B| C| | A| B| D| | A| B| C| -|\n | A| B| -| D|\n ```\n \"\"\"\n if not isinstance(other, BaseTable):\n raise TypeError(f\"stack only works for Table, not {type(other)}\")\n\n cp = self.copy()\n for name, col2 in other.columns.items():\n if name not in cp.columns:\n cp[name] = [None] * len(self)\n cp[name].pages.extend(col2.pages[:])\n\n for name in self.columns:\n if name not in other.columns:\n if len(cp) > 0:\n cp[name].extend(np.array([None] * len(other)))\n return cp\n "},{"location":"reference/base/#tablite.base.BaseTable.types","title":"tablite.base.BaseTable.types() ","text":"returns nested dict of data types in the form: {column name: {python type class: number of instances }, ... 
} example: >>> t.types()\n{\n 'A': {<class 'str'>: 7},\n 'B': {<class 'int'>: 7}\n}\n Source code in tablite/base.py def types(self):\n \"\"\"\n returns nested dict of data types in the form:\n `{column name: {python type class: number of instances }, ... }`\n\n example:\n ```\n >>> t.types()\n {\n 'A': {<class 'str'>: 7},\n 'B': {<class 'int'>: 7}\n }\n ```\n \"\"\"\n d = {}\n for name, col in self.columns.items():\n assert isinstance(col, Column)\n d[name] = col.types()\n return d\n "},{"location":"reference/base/#tablite.base.BaseTable.display_dict","title":"tablite.base.BaseTable.display_dict(slice_=None, blanks=None, dtype=False) ","text":"helper for creating dict for display. PARAMETER DESCRIPTION slice_ python slice. Defaults to None. TYPE: slice DEFAULT: None blanks fill value for None . Defaults to None. TYPE: optional DEFAULT: None dtype Adds datatype to each column. Defaults to False. TYPE: bool DEFAULT: False RAISES DESCRIPTION TypeError slice_ must be None or slice. RETURNS DESCRIPTION dict from Table. Source code in tablite/base.py def display_dict(self, slice_=None, blanks=None, dtype=False):\n \"\"\"helper for creating dict for display.\n\n Args:\n slice_ (slice, optional): python slice. Defaults to None.\n blanks (optional): fill value for `None`. Defaults to None.\n dtype (bool, optional): Adds datatype to each column. Defaults to False.\n\n Raises:\n TypeError: slice_ must be None or slice.\n\n Returns:\n dict: from Table.\n \"\"\"\n if not self.columns:\n print(\"Empty Table\")\n return\n\n def datatype(col): # PRIVATE\n \"\"\"creates label for column datatype.\"\"\"\n types = col.types()\n if len(types) == 0:\n typ = \"empty\"\n elif len(types) == 1:\n dt, _ = types.popitem()\n typ = dt.__name__\n else:\n typ = \"mixed\"\n return typ\n\n row_count_tags = [\"#\", \"~\", \"*\"]\n cols = set(self.columns)\n for n, tag in product(range(1, 6), row_count_tags):\n if n * tag not in cols:\n tag = n * tag\n break\n\n if not isinstance(slice_, (slice, type(None))):\n raise TypeError(f\"slice_ must be None or slice, not {type(slice_)}\")\n if isinstance(slice_, slice):\n slc = slice_\n if slice_ is None:\n if len(self) <= 20:\n slc = slice(0, 20, 1)\n else:\n slc = None\n\n n = len(self)\n if slc: # either we want slc or we want everything.\n row_no = list(range(*slc.indices(len(self))))\n data = {tag: [f\"{i:,}\".rjust(2) for i in row_no]}\n for name, col in self.columns.items():\n data[name] = list(chain(iter(col), repeat(blanks, times=n - len(col))))[\n slc\n ]\n else:\n data = {}\n j = int(math.ceil(math.log10(n)) / 3) + len(str(n))\n row_no = (\n [f\"{i:,}\".rjust(j) for i in range(7)]\n + [\"...\"]\n + [f\"{i:,}\".rjust(j) for i in range(n - 7, n)]\n )\n data = {tag: row_no}\n\n for name, col in self.columns.items():\n if len(col) == n:\n row = col[:7].tolist() + [\"...\"] + col[-7:].tolist()\n else:\n empty = [blanks] * 7\n head = (col[:7].tolist() + empty)[:7]\n tail = (col[n - 7 :].tolist() + empty)[-7:]\n row = head + [\"...\"] + tail\n data[name] = row\n\n if dtype:\n for name, values in data.items():\n if name in self.columns:\n col = self.columns[name]\n values.insert(0, datatype(col))\n else:\n values.insert(0, \"row\")\n\n return data\n "},{"location":"reference/base/#tablite.base.BaseTable.to_ascii","title":"tablite.base.BaseTable.to_ascii(slice_=None, blanks=None, dtype=False) ","text":"returns ascii view of table as string. PARAMETER DESCRIPTION slice_ slice to determine table snippet. TYPE: slice DEFAULT: None blanks value for whitespace. Defaults to None. 
TYPE: str DEFAULT: None dtype adds subheader with datatype for column. Defaults to False. TYPE: bool DEFAULT: False Source code in tablite/base.py def to_ascii(self, slice_=None, blanks=None, dtype=False):\n \"\"\"returns ascii view of table as string.\n\n Args:\n slice_ (slice, optional): slice to determine table snippet.\n blanks (str, optional): value for whitespace. Defaults to None.\n dtype (bool, optional): adds subheader with datatype for column. Defaults to False.\n \"\"\"\n\n def adjust(v, length): # PRIVATE FUNCTION\n \"\"\"whitespace justifies field values based on datatype\"\"\"\n if v is None:\n return str(blanks).ljust(length)\n elif isinstance(v, str):\n return v.ljust(length)\n else:\n return str(v).rjust(length)\n\n if not self.columns:\n return str(self)\n\n d = {}\n for name, values in self.display_dict(\n slice_=slice_, blanks=blanks, dtype=dtype\n ).items():\n as_text = [str(v) for v in values] + [str(name)]\n width = max(len(i) for i in as_text)\n new_name = name.center(width, \" \")\n if dtype:\n values[0] = values[0].center(width, \" \")\n d[new_name] = [adjust(v, width) for v in values]\n\n rows = dict_to_rows(d)\n s = []\n s.append(\"+\" + \"+\".join([\"=\" * len(n) for n in rows[0]]) + \"+\")\n s.append(\"|\" + \"|\".join(rows[0]) + \"|\") # column names\n start = 1\n if dtype:\n s.append(\"|\" + \"|\".join(rows[1]) + \"|\") # datatypes\n start = 2\n\n s.append(\"+\" + \"+\".join([\"-\" * len(n) for n in rows[0]]) + \"+\")\n for row in rows[start:]:\n s.append(\"|\" + \"|\".join(row) + \"|\")\n s.append(\"+\" + \"+\".join([\"=\" * len(n) for n in rows[0]]) + \"+\")\n\n if len(set(len(c) for c in self.columns.values())) != 1:\n warning = f\"Warning: Columns have different lengths. {blanks} is used as fill value.\"\n s.append(warning)\n\n return \"\\n\".join(s)\n "},{"location":"reference/base/#tablite.base.BaseTable.show","title":"tablite.base.BaseTable.show(slice_=None, blanks=None, dtype=False) ","text":"prints ascii view of table. PARAMETER DESCRIPTION slice_ slice to determine table snippet. TYPE: slice DEFAULT: None blanks value for whitespace. Defaults to None. TYPE: str DEFAULT: None dtype adds subheader with datatype for column. Defaults to False. TYPE: bool DEFAULT: False Source code in tablite/base.py def show(self, slice_=None, blanks=None, dtype=False):\n \"\"\"prints ascii view of table.\n\n Args:\n slice_ (slice, optional): slice to determine table snippet.\n blanks (str, optional): value for whitespace. Defaults to None.\n dtype (bool, optional): adds subheader with datatype for column. Defaults to False.\n \"\"\"\n print(self.to_ascii(slice_=slice_, blanks=blanks, dtype=dtype))\n "},{"location":"reference/base/#tablite.base.BaseTable.to_dict","title":"tablite.base.BaseTable.to_dict(columns=None, slice_=None) ","text":"columns: list of column names. Default is None == all columns. slice_: slice. Default is None == all rows. returns: dict with columns as keys and lists of values. Example: >>> t.show()\n+===+===+===+\n| # | a | b |\n|row|int|int|\n+---+---+---+\n| 0 | 1| 3|\n| 1 | 2| 4|\n+===+===+===+\n>>> t.to_dict()\n{'a':[1,2], 'b':[3,4]}\n Source code in tablite/base.py def to_dict(self, columns=None, slice_=None):\n \"\"\"\n columns: list of column names. Default is None == all columns.\n slice_: slice. 
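A sketch of the display helpers in use, with invented data:
```
from tablite import Table

t = Table(columns={"a": [1, 2, 3], "b": ["x", "y", None]})
t.show()                                # prints the whole table
t.show(slice_=slice(0, 2), dtype=True)  # first two rows plus a datatype sub-header
txt = t.to_ascii(blanks="-")            # the same view as a string; None shown as "-"
```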
Default is None == all rows.\n\n returns: dict with columns as keys and lists of values.\n\n Example:\n ```\n >>> t.show()\n +===+===+===+\n | # | a | b |\n |row|int|int|\n +---+---+---+\n | 0 | 1| 3|\n | 1 | 2| 4|\n +===+===+===+\n >>> t.to_dict()\n {'a':[1,2], 'b':[3,4]}\n ```\n\n \"\"\"\n if slice_ is None:\n slice_ = slice(0, len(self))\n assert isinstance(slice_, slice)\n\n if columns is None:\n columns = list(self.columns.keys())\n if not isinstance(columns, list):\n raise TypeError(\"expected columns as list of strings\")\n\n return {name: list(self.columns[name][slice_]) for name in columns}\n "},{"location":"reference/base/#tablite.base.BaseTable.as_json_serializable","title":"tablite.base.BaseTable.as_json_serializable(row_count='row id', start_on=1, columns=None, slice_=None) ","text":"provides a JSON compatible format of the table. PARAMETER DESCRIPTION row_count Label for row counts. Defaults to \"row id\". TYPE: str DEFAULT: 'row id' start_on row counts starts by default on 1. TYPE: int DEFAULT: 1 columns Column names. Defaults to None which returns all columns. TYPE: list of str DEFAULT: None slice_ selector. Defaults to None which returns [:] TYPE: slice DEFAULT: None RETURNS DESCRIPTION JSON serializable dict: All python datatypes have been converted to JSON compliant data. Source code in tablite/base.py def as_json_serializable(\n self, row_count=\"row id\", start_on=1, columns=None, slice_=None\n):\n \"\"\"provides a JSON compatible format of the table.\n\n Args:\n row_count (str, optional): Label for row counts. Defaults to \"row id\".\n start_on (int, optional): row counts starts by default on 1.\n columns (list of str, optional): Column names.\n Defaults to None which returns all columns.\n slice_ (slice, optional): selector. Defaults to None which returns [:]\n\n Returns:\n JSON serializable dict: All python datatypes have been converted to JSON compliant data.\n \"\"\"\n if slice_ is None:\n slice_ = slice(0, len(self))\n\n assert isinstance(slice_, slice)\n new = {\"columns\": {}, \"total_rows\": len(self)}\n if row_count is not None:\n new[\"columns\"][row_count] = [\n i + start_on for i in range(*slice_.indices(len(self)))\n ]\n\n d = self.to_dict(columns, slice_=slice_)\n for k, data in d.items():\n new_k = unique_name(\n k, new[\"columns\"]\n ) # used to avoid overwriting the `row id` key.\n new[\"columns\"][new_k] = [\n DataTypes.to_json(v) for v in data\n ] # deal with non-json datatypes.\n return new\n "},{"location":"reference/base/#tablite.base.BaseTable.index","title":"tablite.base.BaseTable.index(*args) ","text":"param: *args: column names returns multikey index on the columns as d[(key tuple, )] = {index1, index2, ...} Examples: >>> table6 = Table()\n>>> table6['A'] = ['Alice', 'Bob', 'Bob', 'Ben', 'Charlie', 'Ben','Albert']\n>>> table6['B'] = ['Alison', 'Marley', 'Dylan', 'Affleck', 'Hepburn', 'Barnes', 'Einstein']\n >>> table6.index('A') # single key.\n{('Alice',): [0],\n ('Bob',): [1, 2],\n ('Ben',): [3, 5],\n ('Charlie',): [4],\n ('Albert',): [6]})\n >>> table6.index('A', 'B') # multiple keys.\n{('Alice', 'Alison'): [0],\n ('Bob', 'Marley'): [1],\n ('Bob', 'Dylan'): [2],\n ('Ben', 'Affleck'): [3],\n ('Charlie', 'Hepburn'): [4],\n ('Ben', 'Barnes'): [5],\n ('Albert', 'Einstein'): [6]})\n Source code in tablite/base.py def index(self, *args):\n \"\"\"\n param: *args: column names\n returns multikey index on the columns as d[(key tuple, )] = {index1, index2, ...}\n\n Examples:\n ```\n >>> table6 = Table()\n >>> table6['A'] = ['Alice', 'Bob', 'Bob', 
'Ben', 'Charlie', 'Ben','Albert']\n >>> table6['B'] = ['Alison', 'Marley', 'Dylan', 'Affleck', 'Hepburn', 'Barnes', 'Einstein']\n ```\n\n ```\n >>> table6.index('A') # single key.\n {('Alice',): [0],\n ('Bob',): [1, 2],\n ('Ben',): [3, 5],\n ('Charlie',): [4],\n ('Albert',): [6]})\n ```\n\n ```\n >>> table6.index('A', 'B') # multiple keys.\n {('Alice', 'Alison'): [0],\n ('Bob', 'Marley'): [1],\n ('Bob', 'Dylan'): [2],\n ('Ben', 'Affleck'): [3],\n ('Charlie', 'Hepburn'): [4],\n ('Ben', 'Barnes'): [5],\n ('Albert', 'Einstein'): [6]})\n ```\n\n \"\"\"\n idx = defaultdict(list)\n iterators = [iter(self.columns[c]) for c in args]\n for ix, key in enumerate(zip(*iterators)):\n key = tuple(numpy_to_python(k) for k in key)\n idx[key].append(ix)\n return idx\n "},{"location":"reference/base/#tablite.base.BaseTable.unique_index","title":"tablite.base.BaseTable.unique_index(*args, tqdm=_tqdm) ","text":"generates the index of unique rows given a list of column names PARAMETER DESCRIPTION *args columns names TYPE: any DEFAULT: () tqdm Defaults to _tqdm. TYPE: tqdm DEFAULT: tqdm RETURNS DESCRIPTION np.array(int64): indices of unique records. Source code in tablite/base.py def unique_index(self, *args, tqdm=_tqdm):\n \"\"\"generates the index of unique rows given a list of column names\n\n Args:\n *args (any): columns names\n tqdm (tqdm, optional): Defaults to _tqdm.\n\n Returns:\n np.array(int64): indices of unique records.\n \"\"\"\n if not args:\n raise ValueError(\"*args (column names) is required\")\n seen = set()\n unique = set()\n iterators = [iter(self.columns[c]) for c in args]\n for ix, key in tqdm(enumerate(zip(*iterators)), disable=Config.TQDM_DISABLE):\n key_hash = hash(tuple(numpy_to_python(k) for k in key))\n if key_hash in seen:\n continue\n else:\n seen.add(key_hash)\n unique.add(ix)\n return np.array(sorted(unique))\n "},{"location":"reference/base/#tablite.base-functions","title":"Functions","text":""},{"location":"reference/base/#tablite.base.register","title":"tablite.base.register(path) ","text":"registers path in file_registry The method is used by Table during init when the working directory path is set, so that python can clean all temporary files up at exit. PARAMETER DESCRIPTION path typically tmp/tablite-tmp/PID-{os.getpid()} TYPE: Path Source code in tablite/base.py def register(path):\n \"\"\"registers path in file_registry\n\n The method is used by Table during init when the working directory path\n is set, so that python can clean all temporary files up at exit.\n\n Args:\n path (Path): typically tmp/tablite-tmp/PID-{os.getpid()}\n \"\"\"\n global file_registry\n file_registry.add(path)\n "},{"location":"reference/base/#tablite.base.shutdown","title":"tablite.base.shutdown() ","text":"method to clean up temporary files triggered at shutdown. Source code in tablite/base.py def shutdown():\n \"\"\"method to clean up temporary files triggered at shutdown.\"\"\"\n for path in file_registry:\n if Config.pid in str(path): # safety feature to prevent rm -rf /\n log.debug(f\"shutdown: running rmtree({path})\")\n shutil.rmtree(path)\n "},{"location":"reference/config/","title":"Config","text":""},{"location":"reference/config/#tablite.config","title":"tablite.config ","text":""},{"location":"reference/config/#tablite.config-classes","title":"Classes","text":""},{"location":"reference/config/#tablite.config.Config","title":"tablite.config.Config ","text":" Bases: object Config class for Tablite Tables. 
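Returning to index and unique_index above, a minimal sketch with invented data; results are shown as comments:
```
from tablite import Table

t = Table(columns={"A": ["Bob", "Bob", "Ben"], "B": [1, 2, 3]})

t.index("A")         # {('Bob',): [0, 1], ('Ben',): [2]}
t.index("A", "B")    # {('Bob', 1): [0], ('Bob', 2): [1], ('Ben', 3): [2]}
t.unique_index("A")  # array([0, 2]) -- first occurrence of each key
```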
The default location for the storage is loaded as Config.workdir = pathlib.Path(os.environ.get(\"TABLITE_TMPDIR\", f\"{tempfile.gettempdir()}/tablite-tmp\"))\n to overwrite, first import the config class, then set the new workdir. >>> from tablite import config\n>>> from pathlib import Path\n>>> config.workdir = Path(\"/this/new/location\")\n the new path will now be used for every new table. PAGE_SIZE = 1_000_000 sets the page size limit. Multiprocessing is enabled in one of three modes: AUTO = \"auto\" FALSE = \"sp\" FORCE = \"mp\" MULTIPROCESSING_MODE = AUTO is default. SINGLE_PROCESSING_LIMIT = 1_000_000 when the number of fields (rows x columns) exceed this value, multiprocessing is used. "},{"location":"reference/config/#tablite.config.Config-attributes","title":"Attributes","text":""},{"location":"reference/config/#tablite.config.Config.USE_NIMPORTER","title":"tablite.config.Config.USE_NIMPORTER = os.environ.get('USE_NIMPORTER', 'true').lower() in ['1', 't', 'true', 'y', 'yes'] class-attribute instance-attribute ","text":""},{"location":"reference/config/#tablite.config.Config.ALLOW_CSV_READER_FALLTHROUGH","title":"tablite.config.Config.ALLOW_CSV_READER_FALLTHROUGH = os.environ.get('ALLOW_CSV_READER_FALLTHROUGH', 'true').lower() in ['1', 't', 'true', 'y', 'yes'] class-attribute instance-attribute ","text":""},{"location":"reference/config/#tablite.config.Config.NIM_SUPPORTED_CONV_TYPES","title":"tablite.config.Config.NIM_SUPPORTED_CONV_TYPES = ['Windows-1252', 'ISO-8859-1'] class-attribute instance-attribute ","text":""},{"location":"reference/config/#tablite.config.Config.workdir","title":"tablite.config.Config.workdir = pathlib.Path(os.environ.get('TABLITE_TMPDIR', f'{tempfile.gettempdir()}/tablite-tmp')) class-attribute instance-attribute ","text":""},{"location":"reference/config/#tablite.config.Config.pid","title":"tablite.config.Config.pid = f'pid-{os.getpid()}' class-attribute instance-attribute ","text":""},{"location":"reference/config/#tablite.config.Config.PAGE_SIZE","title":"tablite.config.Config.PAGE_SIZE = 1000000 class-attribute instance-attribute ","text":""},{"location":"reference/config/#tablite.config.Config.ENCODING","title":"tablite.config.Config.ENCODING = 'UTF-8' class-attribute instance-attribute ","text":""},{"location":"reference/config/#tablite.config.Config.DISK_LIMIT","title":"tablite.config.Config.DISK_LIMIT = int(10000000000.0) class-attribute instance-attribute ","text":"10e9 (10Gb) on 100 Gb disk means raise at 90 Gb disk usage. if DISK_LIMIT <= 0, the check is turned off. "},{"location":"reference/config/#tablite.config.Config.SINGLE_PROCESSING_LIMIT","title":"tablite.config.Config.SINGLE_PROCESSING_LIMIT = 1000000 class-attribute instance-attribute ","text":"when the number of fields (rows x columns) exceed this value, multiprocessing is used. 
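A sketch of tuning these settings; reset and page_steps are documented below, and the workdir location is invented:
```
from pathlib import Path
from tablite.config import Config

Config.workdir = Path("/mnt/fast-ssd/tablite-tmp")   # invented location
Config.PAGE_SIZE = 250_000                  # smaller pages, lower peak memory
Config.MULTIPROCESSING_MODE = Config.FORCE  # "mp": always use multiprocessing

list(Config.page_steps(600_000))
# [(0, 250000), (250000, 500000), (500000, 600000)]

Config.reset()   # restore the defaults when done
```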
"},{"location":"reference/config/#tablite.config.Config.vpus","title":"tablite.config.Config.vpus = max(os.cpu_count() - 1, 1) class-attribute instance-attribute ","text":""},{"location":"reference/config/#tablite.config.Config.AUTO","title":"tablite.config.Config.AUTO = 'auto' class-attribute instance-attribute ","text":""},{"location":"reference/config/#tablite.config.Config.FALSE","title":"tablite.config.Config.FALSE = 'sp' class-attribute instance-attribute ","text":""},{"location":"reference/config/#tablite.config.Config.FORCE","title":"tablite.config.Config.FORCE = 'mp' class-attribute instance-attribute ","text":""},{"location":"reference/config/#tablite.config.Config.MULTIPROCESSING_MODE","title":"tablite.config.Config.MULTIPROCESSING_MODE = AUTO class-attribute instance-attribute ","text":""},{"location":"reference/config/#tablite.config.Config.TQDM_DISABLE","title":"tablite.config.Config.TQDM_DISABLE = False class-attribute instance-attribute ","text":""},{"location":"reference/config/#tablite.config.Config-functions","title":"Functions","text":""},{"location":"reference/config/#tablite.config.Config.reset","title":"tablite.config.Config.reset() classmethod ","text":"Resets the config class to original values. Source code in tablite/config.py @classmethod\ndef reset(cls):\n \"\"\"Resets the config class to original values.\"\"\"\n for k, v in _default_values.items():\n setattr(Config, k, v)\n "},{"location":"reference/config/#tablite.config.Config.page_steps","title":"tablite.config.Config.page_steps(length) classmethod ","text":"an iterator that yield start and end in page sizes YIELDS DESCRIPTION tuple start:int, end:int Source code in tablite/config.py @classmethod\ndef page_steps(cls, length):\n \"\"\"an iterator that yield start and end in page sizes\n\n Yields:\n tuple: start:int, end:int\n \"\"\"\n start, end = 0, 0\n for _ in range(0, length + 1, cls.PAGE_SIZE):\n start, end = end, min(end + cls.PAGE_SIZE, length)\n yield start, end\n if end == length:\n return\n "},{"location":"reference/core/","title":"Core","text":""},{"location":"reference/core/#tablite.core","title":"tablite.core ","text":""},{"location":"reference/core/#tablite.core-attributes","title":"Attributes","text":""},{"location":"reference/core/#tablite.core.log","title":"tablite.core.log = logging.getLogger(__name__) module-attribute ","text":""},{"location":"reference/core/#tablite.core-classes","title":"Classes","text":""},{"location":"reference/core/#tablite.core.Table","title":"tablite.core.Table(columns=None, headers=None, rows=None, _path=None) ","text":" Bases: BaseTable creates Table PARAMETER DESCRIPTION EITHER columns (dict, optional): dict with column names as keys, values as lists. 
Example: t = Table(columns={\"a\": [1, 2], \"b\": [3, 4]}) Source code in tablite/core.py def __init__(self, columns=None, headers=None, rows=None, _path=None) -> None:\n \"\"\"creates Table\n\n Args:\n EITHER:\n columns (dict, optional): dict with column names as keys, values as lists.\n Example: t = Table(columns={\"a\": [1, 2], \"b\": [3, 4]})\n OR\n headers (list of strings, optional): list of column names.\n rows (list of tuples or lists, optional): values for columns\n Example: t = Table(headers=[\"a\", \"b\"], rows=[[1,3], [2,4]])\n \"\"\"\n super().__init__(columns, headers, rows, _path)\n "},{"location":"reference/core/#tablite.core.Table-attributes","title":"Attributes","text":""},{"location":"reference/core/#tablite.core.Table.path","title":"tablite.core.Table.path = _path instance-attribute ","text":""},{"location":"reference/core/#tablite.core.Table.columns","title":"tablite.core.Table.columns = {} instance-attribute ","text":""},{"location":"reference/core/#tablite.core.Table.rows","title":"tablite.core.Table.rows property ","text":"enables row based iteration in python types. Example: for row in Table.rows:\n print(row)\n Yields: tuple: values is same order as columns. "},{"location":"reference/core/#tablite.core.Table-functions","title":"Functions","text":""},{"location":"reference/core/#tablite.core.Table.__str__","title":"tablite.core.Table.__str__() ","text":"Source code in tablite/base.py def __str__(self): # USER FUNCTION.\n return f\"{self.__class__.__name__}({len(self.columns):,} columns, {len(self):,} rows)\"\n "},{"location":"reference/core/#tablite.core.Table.__repr__","title":"tablite.core.Table.__repr__() ","text":"Source code in tablite/base.py def __repr__(self):\n return self.__str__()\n "},{"location":"reference/core/#tablite.core.Table.nbytes","title":"tablite.core.Table.nbytes() ","text":"finds the total bytes of the table on disk RETURNS DESCRIPTION tuple int: real bytes used on disk int: total bytes used if flattened Source code in tablite/base.py def nbytes(self): # USER FUNCTION.\n \"\"\"finds the total bytes of the table on disk\n\n Returns:\n tuple:\n int: real bytes used on disk\n int: total bytes used if flattened\n \"\"\"\n real = {}\n total = 0\n for column in self.columns.values():\n for page in set(column.pages):\n real[page] = page.path.stat().st_size\n for page in column.pages:\n total += real[page]\n return sum(real.values()), total\n "},{"location":"reference/core/#tablite.core.Table.items","title":"tablite.core.Table.items() ","text":"returns table as dict RETURNS DESCRIPTION dict Table as dict {column_name: [values], ...} Source code in tablite/base.py def items(self): # USER FUNCTION.\n \"\"\"returns table as dict\n\n Returns:\n dict: Table as dict `{column_name: [values], ...}`\n \"\"\"\n return {\n name: column[:].tolist() for name, column in self.columns.items()\n }.items()\n "},{"location":"reference/core/#tablite.core.Table.__delitem__","title":"tablite.core.Table.__delitem__(key) ","text":"Examples: >>> del table['a'] # removes column 'a'\n>>> del table[-3:] # removes last 3 rows from all columns.\n Source code in tablite/base.py def __delitem__(self, key): # USER FUNCTION.\n \"\"\"\n Examples:\n ```\n >>> del table['a'] # removes column 'a'\n >>> del table[-3:] # removes last 3 rows from all columns.\n ```\n \"\"\"\n if isinstance(key, (int, slice)):\n for column in self.columns.values():\n del column[key]\n elif key in self.columns:\n del self.columns[key]\n else:\n raise KeyError(f\"Key not found: {key}\")\n 
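Pulling the constructor and the dict-like access together; the selection patterns sketched here are documented in detail below:
```
from tablite import Table

t = Table(columns={"a": [1, 2, 3], "b": [4, 5, 6]})
# equivalent: Table(headers=["a", "b"], rows=[[1, 4], [2, 5], [3, 6]])

for row in t.rows:          # row-wise iteration in python types
    print(row)              # (1, 4), then (2, 5), then (3, 6)

t["c"] = [7, 8, 9]          # add a column, dict-style
t[1]                        # (2, 5, 8) -- row 1 as a tuple
t[:2]                       # new Table with the first two rows
t["a", "c", slice(0, 2)]    # columns 'a' and 'c', rows 0-1
del t[-1:]                  # drop the last row from every column
```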
"},{"location":"reference/core/#tablite.core.Table.__setitem__","title":"tablite.core.Table.__setitem__(key, value) ","text":"table behaves like a dict. Args: key (str or hashable): column name value (iterable): list, tuple or nd.array with values. As Table now accepts the keyword columns as a dict: >>> t = Table(columns={'b':[4,5,6], 'c':[7,8,9]})\n and the header/data combinations: >>> t = Table(header=['b','c'], data=[[4,5,6],[7,8,9]])\n This has the side-benefit that tuples now can be used as headers. Source code in tablite/base.py def __setitem__(self, key, value): # USER FUNCTION\n \"\"\"table behaves like a dict.\n Args:\n key (str or hashable): column name\n value (iterable): list, tuple or nd.array with values.\n\n As Table now accepts the keyword `columns` as a dict:\n ```\n >>> t = Table(columns={'b':[4,5,6], 'c':[7,8,9]})\n ```\n and the header/data combinations:\n ```\n >>> t = Table(header=['b','c'], data=[[4,5,6],[7,8,9]])\n ```\n This has the side-benefit that tuples now can be used as headers.\n \"\"\"\n if value is None:\n self.columns[key] = Column(self.path, value=None)\n elif isinstance(value, (list, tuple)):\n value = list_to_np_array(value)\n self.columns[key] = Column(self.path, value)\n elif isinstance(value, (np.ndarray)):\n self.columns[key] = Column(self.path, value)\n elif isinstance(value, Column):\n self.columns[key] = value\n else:\n raise TypeError(f\"{type(value)} not supported.\")\n "},{"location":"reference/core/#tablite.core.Table.__getitem__","title":"tablite.core.Table.__getitem__(keys) ","text":"Enables selection of columns and rows PARAMETER DESCRIPTION keys TYPE: column name, integer or slice Examples >>> 10] selects first 10 rows from all columns TYPE: table[ >>> 20:3] selects column 'b' and 'c' and 'a' twice for a slice. TYPE: table['b', 'a', 'a', 'c', 2 Raises: KeyError: if key is not found. TypeError: if key is not a string, integer or slice. RETURNS DESCRIPTION Table returns columns in same order as selection. Source code in tablite/base.py def __getitem__(self, keys): # USER FUNCTION\n \"\"\"\n Enables selection of columns and rows\n\n Args:\n keys (column name, integer or slice):\n Examples:\n ```\n >>> table['a'] selects column 'a'\n >>> table[3] selects row 3 as a tuple.\n >>> table[:10] selects first 10 rows from all columns\n >>> table['a','b', slice(3,20,2)] selects a slice from columns 'a' and 'b'\n >>> table['b', 'a', 'a', 'c', 2:20:3] selects column 'b' and 'c' and 'a' twice for a slice.\n >>> table[('b', 'a', 'a', 'c')] selects columns 'b', 'a', 'a', and 'c' using a tuple.\n ```\n Raises:\n KeyError: if key is not found.\n TypeError: if key is not a string, integer or slice.\n\n Returns:\n Table: returns columns in same order as selection.\n \"\"\"\n\n if not isinstance(keys, tuple):\n if isinstance(keys, list):\n keys = tuple(keys)\n else:\n keys = (keys,)\n if isinstance(keys[0], tuple):\n keys = tuple(list(chain(*keys)))\n\n integers = [i for i in keys if isinstance(i, int)]\n if len(integers) == len(keys) == 1: # return a single tuple.\n keys = [slice(keys[0])]\n\n column_names = [i for i in keys if isinstance(i, str)]\n column_names = list(self.columns) if not column_names else column_names\n not_found = [name for name in column_names if name not in self.columns]\n if not_found:\n raise KeyError(f\"keys not found: {', '.join(not_found)}\")\n\n slices = [i for i in keys if isinstance(i, slice)]\n slc = slice(0, len(self)) if not slices else slices[0]\n\n if (\n len(slices) == 0 and len(column_names) == 1\n ): # e.g. 
tbl['a'] or tbl['a'][:10]\n col = self.columns[column_names[0]]\n if slices:\n return col[slc] # return slice from column as list of values\n else:\n return col # return whole column\n\n elif len(integers) == 1: # return a single tuple.\n row_no = integers[0]\n slc = slice(row_no, row_no + 1)\n return tuple(self.columns[name][slc].tolist()[0] for name in column_names)\n\n elif not slices: # e.g. new table with N whole columns.\n return self.__class__(\n columns={name: self.columns[name] for name in column_names}\n )\n\n else: # e.g. new table from selection of columns and slices.\n t = self.__class__()\n for name in column_names:\n column = self.columns[name]\n\n new_column = Column(t.path) # create new Column.\n for item in column.getpages(slc):\n if isinstance(item, np.ndarray):\n new_column.extend(item) # extend subslice (expensive)\n elif isinstance(item, SimplePage):\n new_column.pages.append(item) # extend page (cheap)\n else:\n raise TypeError(f\"Bad item: {item}\")\n\n # below:\n # set the new column directly on t.columns.\n # Do not use t[name] as that triggers __setitem__ again.\n t.columns[name] = new_column\n\n return t\n "},{"location":"reference/core/#tablite.core.Table.__len__","title":"tablite.core.Table.__len__() ","text":"Source code in tablite/base.py def __len__(self): # USER FUNCTION.\n if not self.columns:\n return 0\n return max(len(c) for c in self.columns.values())\n "},{"location":"reference/core/#tablite.core.Table.__eq__","title":"tablite.core.Table.__eq__(other) -> bool ","text":"Determines if two tables have identical content. PARAMETER DESCRIPTION other table for comparison TYPE: Table RETURNS DESCRIPTION bool True if tables are identical. TYPE: bool Source code in tablite/base.py def __eq__(self, other) -> bool: # USER FUNCTION.\n \"\"\"Determines if two tables have identical content.\n\n Args:\n other (Table): table for comparison\n\n Returns:\n bool: True if tables are identical.\n \"\"\"\n if isinstance(other, dict):\n return self.items() == other.items()\n if not isinstance(other, BaseTable):\n return False\n if id(self) == id(other):\n return True\n if len(self) != len(other):\n return False\n if len(self) == len(other) == 0:\n return True\n if self.columns.keys() != other.columns.keys():\n return False\n for name, col in self.columns.items():\n if not (col == other.columns[name]):\n return False\n return True\n "},{"location":"reference/core/#tablite.core.Table.clear","title":"tablite.core.Table.clear() ","text":"clears the table. Like dict().clear() Source code in tablite/base.py def clear(self): # USER FUNCTION.\n \"\"\"clears the table. Like dict().clear()\"\"\"\n self.columns.clear()\n "},{"location":"reference/core/#tablite.core.Table.save","title":"tablite.core.Table.save(path, compression_method=zipfile.ZIP_DEFLATED, compression_level=1) ","text":"saves table to compressed tpz file. PARAMETER DESCRIPTION path file destination. TYPE: Path compression_method See zipfile compression methods. Defaults to ZIP_DEFLATED. DEFAULT: ZIP_DEFLATED compression_level See zipfile compression levels. Defaults to 1. DEFAULT: 1 The file format is as follows: .tpz is a gzip archive with table metadata captured as table.yml and the necessary set of pages saved as .npy files. 
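A sketch of equality and clear, per __eq__ above:
```
from tablite import Table

a = Table(columns={"x": [1, 2]})
b = Table(columns={"x": [1, 2]})
assert a == b                # content equality, not identity
assert a == {"x": [1, 2]}    # a dict with the same items also compares equal
b.clear()                    # drops all columns, like dict().clear()
assert len(b) == 0
```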
The zip contains table.yml which provides an overview of the data: --------------------------------------\n%YAML 1.2 yaml version\ncolumns: start of columns section.\n name: \u201c\u5217 1\u201d name of column 1.\n pages: [p1b1, p1b2] list of pages in column 1.\n name: \u201c\u5217 2\u201d name of column 2\n pages: [p2b1, p2b2] list of pages in column 2.\n----------------------------------------\n Source code in tablite/base.py def save(\n self, path, compression_method=zipfile.ZIP_DEFLATED, compression_level=1\n): # USER FUNCTION.\n \"\"\"saves table to compressed tpz file.\n\n Args:\n path (Path): file destination.\n compression_method: See zipfile compression methods. Defaults to ZIP_DEFLATED.\n compression_level: See zipfile compression levels. Defaults to 1.\n The default settings produce 80% compression at 10% slowdown.\n\n The file format is as follows:\n .tpz is a gzip archive with table metadata captured as table.yml\n and the necessary set of pages saved as .npy files.\n\n The zip contains table.yml which provides an overview of the data:\n ```\n --------------------------------------\n %YAML 1.2 yaml version\n columns: start of columns section.\n name: \u201c\u5217 1\u201d name of column 1.\n pages: [p1b1, p1b2] list of pages in column 1.\n name: \u201c\u5217 2\u201d name of column 2\n pages: [p2b1, p2b2] list of pages in column 2.\n ----------------------------------------\n ```\n \"\"\"\n if isinstance(path, str):\n path = Path(path)\n type_check(path, Path)\n if path.is_dir():\n raise TypeError(f\"filename needed: {path}\")\n if path.suffix != \".tpz\":\n path = path.parent / (path.parts[-1] + \".tpz\")\n\n # create yaml document\n _page_counter = 0\n d = {}\n cols = {}\n for name, col in self.columns.items():\n type_check(col, Column)\n cols[name] = {\"pages\": [p.path.name for p in col.pages]}\n _page_counter += len(col.pages)\n d[\"columns\"] = cols\n yml = yaml.safe_dump(\n d, sort_keys=False, allow_unicode=True, default_flow_style=None\n )\n\n _file_counter = 0\n with zipfile.ZipFile(\n path, \"w\", compression=compression_method, compresslevel=compression_level\n ) as f:\n log.debug(f\"writing .tpz to {path} with\\n{yml}\")\n f.writestr(\"table.yml\", yml)\n for name, col in self.columns.items():\n for page in set(\n col.pages\n ): # set of pages! remember t *= 1000 repeats t 1000x\n with open(page.path, \"rb\", buffering=0) as raw_io:\n f.writestr(page.path.name, raw_io.read())\n _file_counter += 1\n log.debug(f\"adding Page {page.path}\")\n\n _fields = len(self) * len(self.columns)\n _avg = _fields // _page_counter\n log.debug(\n f\"Wrote {_fields:,} on {_page_counter:,} pages in {_file_counter} files: {_avg} fields/page\"\n )\n "},{"location":"reference/core/#tablite.core.Table.load","title":"tablite.core.Table.load(path, tqdm=_tqdm) classmethod ","text":"loads a table from .tpz file. See also Table.save for details on the file format. PARAMETER DESCRIPTION path source file TYPE: Path RETURNS DESCRIPTION Table table in read-only mode. 
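A save/load round trip; the zipfile peek below relies only on the format described above, and the file name is invented:
```
import zipfile
from tablite import Table

t = Table(columns={"a": [1, 2], "b": [3, 4]})
t.save("demo.tpz")            # str paths are accepted; the ".tpz" suffix is enforced

t2 = Table.load("demo.tpz")
assert t == t2                # content equality

with zipfile.ZipFile("demo.tpz") as z:   # a .tpz is a plain zip archive
    print(z.read("table.yml").decode())  # column names and their page files
```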
Source code in tablite/base.py @classmethod\ndef load(cls, path, tqdm=_tqdm): # USER FUNCTION.\n \"\"\"loads a table from .tpz file.\n See also Table.save for details on the file format.\n\n Args:\n path (Path): source file\n\n Returns:\n Table: table in read-only mode.\n \"\"\"\n path = Path(path)\n log.debug(f\"loading {path}\")\n with zipfile.ZipFile(path, \"r\") as f:\n yml = f.read(\"table.yml\")\n metadata = yaml.safe_load(yml)\n t = cls()\n\n page_count = sum([len(c[\"pages\"]) for c in metadata[\"columns\"].values()])\n\n with tqdm(\n total=page_count,\n desc=f\"loading '{path.name}' file\",\n disable=Config.TQDM_DISABLE,\n ) as pbar:\n for name, d in metadata[\"columns\"].items():\n column = Column(t.path)\n for page in d[\"pages\"]:\n bytestream = io.BytesIO(f.read(page))\n data = np.load(bytestream, allow_pickle=True, fix_imports=False)\n column.extend(data)\n pbar.update(1)\n t.columns[name] = column\n update_access_time(path)\n return t\n "},{"location":"reference/core/#tablite.core.Table.copy","title":"tablite.core.Table.copy() ","text":"Source code in tablite/base.py def copy(self):\n cls = type(self)\n t = cls()\n for name, column in self.columns.items():\n new = Column(t.path)\n new.pages = column.pages[:]\n t.columns[name] = new\n return t\n "},{"location":"reference/core/#tablite.core.Table.__imul__","title":"tablite.core.Table.__imul__(other) ","text":"Repeats instance of table N times. Like list: t = t * N PARAMETER DESCRIPTION other multiplier TYPE: int Source code in tablite/base.py def __imul__(self, other):\n \"\"\"Repeats instance of table N times.\n\n Like list: `t = t * N`\n\n Args:\n other (int): multiplier\n \"\"\"\n if not (isinstance(other, int) and other > 0):\n raise TypeError(\n f\"a table can be repeated an integer number of times, not {type(other)} number of times\"\n )\n for col in self.columns.values():\n col *= other\n return self\n "},{"location":"reference/core/#tablite.core.Table.__mul__","title":"tablite.core.Table.__mul__(other) ","text":"Repeat table N times. Like list: new = old * N PARAMETER DESCRIPTION other multiplier TYPE: int RETURNS DESCRIPTION Table Source code in tablite/base.py def __mul__(self, other):\n \"\"\"Repeat table N times.\n Like list: `new = old * N`\n\n Args:\n other (int): multiplier\n\n Returns:\n Table\n \"\"\"\n new = self.copy()\n return new.__imul__(other)\n "},{"location":"reference/core/#tablite.core.Table.__iadd__","title":"tablite.core.Table.__iadd__(other) ","text":"Concatenates tables with same column names. Like list: table_1 += table_2 RAISES DESCRIPTION ValueError If column names don't match. RETURNS DESCRIPTION None self is updated. Source code in tablite/base.py def __iadd__(self, other):\n \"\"\"Concatenates tables with same column names.\n\n Like list: `table_1 += table_2`\n\n Args:\n other (Table)\n\n Raises:\n ValueError: If column names don't match.\n\n Returns:\n None: self is updated.\n \"\"\"\n type_check(other, BaseTable)\n for name in self.columns.keys():\n if name not in other.columns:\n raise ValueError(f\"{name} not in other\")\n for name in other.columns.keys():\n if name not in self.columns:\n raise ValueError(f\"{name} missing from self\")\n\n for name, column in self.columns.items():\n other_col = other.columns.get(name, None)\n column.pages.extend(other_col.pages[:])\n return self\n "},{"location":"reference/core/#tablite.core.Table.__add__","title":"tablite.core.Table.__add__(other) ","text":"Concatenates tables with same column names. 
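Before the details of concatenation below, a sketch of copy, repetition and + / +=:
```
from tablite import Table

t = Table(columns={"a": [1, 2]})
t2 = t.copy()      # cheap: pages are referenced, not duplicated

t2 *= 3            # in-place repetition: a == [1, 2, 1, 2, 1, 2]
t3 = t * 2         # new table; t is unchanged

t4 = t + t3        # concatenation; column names must match
t += t4            # in-place concatenation
```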
Like list: table_3 = table_1 + table_2 RAISES DESCRIPTION ValueError If column names don't match. RETURNS DESCRIPTION Table Source code in tablite/base.py def __add__(self, other):\n \"\"\"Concatenates tables with same column names.\n\n Like list: `table_3 = table_1 + table_2`\n\n Args:\n other (Table)\n\n Raises:\n ValueError: If column names don't match.\n\n Returns:\n Table\n \"\"\"\n type_check(other, BaseTable)\n cp = self.copy()\n cp += other\n return cp\n "},{"location":"reference/core/#tablite.core.Table.add_rows","title":"tablite.core.Table.add_rows(*args, **kwargs) ","text":"it's more efficient to add many rows at once. if both args and kwargs, then args are added first, followed by kwargs. supported cases: >>> t = Table()\n>>> t.add_columns('row','A','B','C')\n>>> t.add_rows(1, 1, 2, 3) # (1) individual values as args\n>>> t.add_rows([2, 1, 2, 3]) # (2) list of values as args\n>>> t.add_rows((3, 1, 2, 3)) # (3) tuple of values as args\n>>> t.add_rows(*(4, 1, 2, 3)) # (4) unpacked tuple becomes arg like (1)\n>>> t.add_rows(row=5, A=1, B=2, C=3) # (5) kwargs\n>>> t.add_rows(**{'row': 6, 'A': 1, 'B': 2, 'C': 3}) # (6) dict / json interpreted as kwargs\n>>> t.add_rows((7, 1, 2, 3), (8, 4, 5, 6)) # (7) two (or more) tuples as args\n>>> t.add_rows([9, 1, 2, 3], [10, 4, 5, 6]) # (8) two or more lists as args\n>>> t.add_rows(\n {'row': 11, 'A': 1, 'B': 2, 'C': 3},\n {'row': 12, 'A': 4, 'B': 5, 'C': 6}\n ) # (9) two (or more) dicts as args - roughly comma sep'd json.\n>>> t.add_rows( *[\n {'row': 13, 'A': 1, 'B': 2, 'C': 3},\n {'row': 14, 'A': 1, 'B': 2, 'C': 3}\n ]) # (10) list of dicts as args\n>>> t.add_rows(row=[15,16], A=[1,1], B=[2,2], C=[3,3]) # (11) kwargs with lists as values\n Source code in tablite/base.py def add_rows(self, *args, **kwargs):\n \"\"\"it's more efficient to add many rows at once.\n\n if both args and kwargs, then args are added first, followed by kwargs.\n\n supported cases:\n ```\n >>> t = Table()\n >>> t.add_columns('row','A','B','C')\n >>> t.add_rows(1, 1, 2, 3) # (1) individual values as args\n >>> t.add_rows([2, 1, 2, 3]) # (2) list of values as args\n >>> t.add_rows((3, 1, 2, 3)) # (3) tuple of values as args\n >>> t.add_rows(*(4, 1, 2, 3)) # (4) unpacked tuple becomes arg like (1)\n >>> t.add_rows(row=5, A=1, B=2, C=3) # (5) kwargs\n >>> t.add_rows(**{'row': 6, 'A': 1, 'B': 2, 'C': 3}) # (6) dict / json interpreted as kwargs\n >>> t.add_rows((7, 1, 2, 3), (8, 4, 5, 6)) # (7) two (or more) tuples as args\n >>> t.add_rows([9, 1, 2, 3], [10, 4, 5, 6]) # (8) two or more lists as args\n >>> t.add_rows(\n {'row': 11, 'A': 1, 'B': 2, 'C': 3},\n {'row': 12, 'A': 4, 'B': 5, 'C': 6}\n ) # (9) two (or more) dicts as args - roughly comma sep'd json.\n >>> t.add_rows( *[\n {'row': 13, 'A': 1, 'B': 2, 'C': 3},\n {'row': 14, 'A': 1, 'B': 2, 'C': 3}\n ]) # (10) list of dicts as args\n >>> t.add_rows(row=[15,16], A=[1,1], B=[2,2], C=[3,3]) # (11) kwargs with lists as values\n ```\n\n \"\"\"\n if not BaseTable._add_row_slow_warning:\n warnings.warn(\n \"add_rows is slow. Consider using add_columns and then assigning values to the columns directly.\"\n )\n BaseTable._add_row_slow_warning = True\n\n if args:\n if not all(isinstance(i, (list, tuple, dict)) for i in args): # 1,4\n args = [args]\n\n if all(isinstance(i, (list, tuple, dict)) for i in args): # 2,3,7,8\n # 1. 
turn the data into columns:\n\n d = {n: [] for n in self.columns}\n for arg in args:\n if len(arg) != len(self.columns):\n raise ValueError(\n f\"len({arg})== {len(arg)}, but there are {len(self.columns)} columns\"\n )\n\n if isinstance(arg, dict):\n for k, v in arg.items(): # 7,8\n d[k].append(v)\n\n elif isinstance(arg, (list, tuple)): # 2,3\n for n, v in zip(self.columns, arg):\n d[n].append(v)\n\n else:\n raise TypeError(f\"{arg}?\")\n # 2. extend the columns\n for n, values in d.items():\n col = self.columns[n]\n col.extend(list_to_np_array(values))\n\n if kwargs:\n if isinstance(kwargs, dict):\n if all(isinstance(v, (list, tuple)) for v in kwargs.values()):\n for k, v in kwargs.items():\n col = self.columns[k]\n col.extend(list_to_np_array(v))\n else:\n for k, v in kwargs.items():\n col = self.columns[k]\n col.extend(np.array([v]))\n else:\n raise ValueError(f\"format not recognised: {kwargs}\")\n\n return\n "},{"location":"reference/core/#tablite.core.Table.add_columns","title":"tablite.core.Table.add_columns(*names) ","text":"Adds column names to table. Source code in tablite/base.py def add_columns(self, *names):\n \"\"\"Adds column names to table.\"\"\"\n for name in names:\n self.columns[name] = Column(self.path)\n "},{"location":"reference/core/#tablite.core.Table.add_column","title":"tablite.core.Table.add_column(name, data=None) ","text":"verbose alias for table[name] = data, that checks if name already exists PARAMETER DESCRIPTION name column name TYPE: str data values. Defaults to None. TYPE: list,tuple) DEFAULT: None RAISES DESCRIPTION TypeError name isn't string ValueError name already exists Source code in tablite/base.py def add_column(self, name, data=None):\n \"\"\"verbose alias for table[name] = data, that checks if name already exists\n\n Args:\n name (str): column name\n data ((list,tuple), optional): values. Defaults to None.\n\n Raises:\n TypeError: name isn't string\n ValueError: name already exists\n \"\"\"\n if not isinstance(name, str):\n raise TypeError(\"expected name as string\")\n if name in self.columns:\n raise ValueError(f\"{name} already in {self.columns}\")\n self.__setitem__(name, data)\n "},{"location":"reference/core/#tablite.core.Table.stack","title":"tablite.core.Table.stack(other) ","text":"returns the joint stack of tables with overlapping column names. Example: | Table A| + | Table B| = | Table AB |\n| A| B| C| | A| B| D| | A| B| C| -|\n | A| B| -| D|\n Source code in tablite/base.py def stack(self, other):\n \"\"\"\n returns the joint stack of tables with overlapping column names.\n Example:\n ```\n | Table A| + | Table B| = | Table AB |\n | A| B| C| | A| B| D| | A| B| C| -|\n | A| B| -| D|\n ```\n \"\"\"\n if not isinstance(other, BaseTable):\n raise TypeError(f\"stack only works for Table, not {type(other)}\")\n\n cp = self.copy()\n for name, col2 in other.columns.items():\n if name not in cp.columns:\n cp[name] = [None] * len(self)\n cp[name].pages.extend(col2.pages[:])\n\n for name in self.columns:\n if name not in other.columns:\n if len(cp) > 0:\n cp[name].extend(np.array([None] * len(other)))\n return cp\n "},{"location":"reference/core/#tablite.core.Table.types","title":"tablite.core.Table.types() ","text":"returns nested dict of data types in the form: {column name: {python type class: number of instances }, ... 
} example: >>> t.types()\n{\n 'A': {<class 'str'>: 7},\n 'B': {<class 'int'>: 7}\n}\n Source code in tablite/base.py def types(self):\n \"\"\"\n returns nested dict of data types in the form:\n `{column name: {python type class: number of instances }, ... }`\n\n example:\n ```\n >>> t.types()\n {\n 'A': {<class 'str'>: 7},\n 'B': {<class 'int'>: 7}\n }\n ```\n \"\"\"\n d = {}\n for name, col in self.columns.items():\n assert isinstance(col, Column)\n d[name] = col.types()\n return d\n "},{"location":"reference/core/#tablite.core.Table.display_dict","title":"tablite.core.Table.display_dict(slice_=None, blanks=None, dtype=False) ","text":"helper for creating dict for display. PARAMETER DESCRIPTION slice_ python slice. Defaults to None. TYPE: slice DEFAULT: None blanks fill value for None . Defaults to None. TYPE: optional DEFAULT: None dtype Adds datatype to each column. Defaults to False. TYPE: bool DEFAULT: False RAISES DESCRIPTION TypeError slice_ must be None or slice. RETURNS DESCRIPTION dict from Table. Source code in tablite/base.py def display_dict(self, slice_=None, blanks=None, dtype=False):\n \"\"\"helper for creating dict for display.\n\n Args:\n slice_ (slice, optional): python slice. Defaults to None.\n blanks (optional): fill value for `None`. Defaults to None.\n dtype (bool, optional): Adds datatype to each column. Defaults to False.\n\n Raises:\n TypeError: slice_ must be None or slice.\n\n Returns:\n dict: from Table.\n \"\"\"\n if not self.columns:\n print(\"Empty Table\")\n return\n\n def datatype(col): # PRIVATE\n \"\"\"creates label for column datatype.\"\"\"\n types = col.types()\n if len(types) == 0:\n typ = \"empty\"\n elif len(types) == 1:\n dt, _ = types.popitem()\n typ = dt.__name__\n else:\n typ = \"mixed\"\n return typ\n\n row_count_tags = [\"#\", \"~\", \"*\"]\n cols = set(self.columns)\n for n, tag in product(range(1, 6), row_count_tags):\n if n * tag not in cols:\n tag = n * tag\n break\n\n if not isinstance(slice_, (slice, type(None))):\n raise TypeError(f\"slice_ must be None or slice, not {type(slice_)}\")\n if isinstance(slice_, slice):\n slc = slice_\n if slice_ is None:\n if len(self) <= 20:\n slc = slice(0, 20, 1)\n else:\n slc = None\n\n n = len(self)\n if slc: # either we want slc or we want everything.\n row_no = list(range(*slc.indices(len(self))))\n data = {tag: [f\"{i:,}\".rjust(2) for i in row_no]}\n for name, col in self.columns.items():\n data[name] = list(chain(iter(col), repeat(blanks, times=n - len(col))))[\n slc\n ]\n else:\n data = {}\n j = int(math.ceil(math.log10(n)) / 3) + len(str(n))\n row_no = (\n [f\"{i:,}\".rjust(j) for i in range(7)]\n + [\"...\"]\n + [f\"{i:,}\".rjust(j) for i in range(n - 7, n)]\n )\n data = {tag: row_no}\n\n for name, col in self.columns.items():\n if len(col) == n:\n row = col[:7].tolist() + [\"...\"] + col[-7:].tolist()\n else:\n empty = [blanks] * 7\n head = (col[:7].tolist() + empty)[:7]\n tail = (col[n - 7 :].tolist() + empty)[-7:]\n row = head + [\"...\"] + tail\n data[name] = row\n\n if dtype:\n for name, values in data.items():\n if name in self.columns:\n col = self.columns[name]\n values.insert(0, datatype(col))\n else:\n values.insert(0, \"row\")\n\n return data\n "},{"location":"reference/core/#tablite.core.Table.to_ascii","title":"tablite.core.Table.to_ascii(slice_=None, blanks=None, dtype=False) ","text":"returns ascii view of table as string. PARAMETER DESCRIPTION slice_ slice to determine table snippet. TYPE: slice DEFAULT: None blanks value for whitespace. Defaults to None. 
TYPE: str DEFAULT: None dtype adds subheader with datatype for column. Defaults to False. TYPE: bool DEFAULT: False Source code in tablite/base.py def to_ascii(self, slice_=None, blanks=None, dtype=False):\n \"\"\"returns ascii view of table as string.\n\n Args:\n slice_ (slice, optional): slice to determine table snippet.\n blanks (str, optional): value for whitespace. Defaults to None.\n dtype (bool, optional): adds subheader with datatype for column. Defaults to False.\n \"\"\"\n\n def adjust(v, length): # PRIVATE FUNCTION\n \"\"\"whitespace justifies field values based on datatype\"\"\"\n if v is None:\n return str(blanks).ljust(length)\n elif isinstance(v, str):\n return v.ljust(length)\n else:\n return str(v).rjust(length)\n\n if not self.columns:\n return str(self)\n\n d = {}\n for name, values in self.display_dict(\n slice_=slice_, blanks=blanks, dtype=dtype\n ).items():\n as_text = [str(v) for v in values] + [str(name)]\n width = max(len(i) for i in as_text)\n new_name = name.center(width, \" \")\n if dtype:\n values[0] = values[0].center(width, \" \")\n d[new_name] = [adjust(v, width) for v in values]\n\n rows = dict_to_rows(d)\n s = []\n s.append(\"+\" + \"+\".join([\"=\" * len(n) for n in rows[0]]) + \"+\")\n s.append(\"|\" + \"|\".join(rows[0]) + \"|\") # column names\n start = 1\n if dtype:\n s.append(\"|\" + \"|\".join(rows[1]) + \"|\") # datatypes\n start = 2\n\n s.append(\"+\" + \"+\".join([\"-\" * len(n) for n in rows[0]]) + \"+\")\n for row in rows[start:]:\n s.append(\"|\" + \"|\".join(row) + \"|\")\n s.append(\"+\" + \"+\".join([\"=\" * len(n) for n in rows[0]]) + \"+\")\n\n if len(set(len(c) for c in self.columns.values())) != 1:\n warning = f\"Warning: Columns have different lengths. {blanks} is used as fill value.\"\n s.append(warning)\n\n return \"\\n\".join(s)\n "},{"location":"reference/core/#tablite.core.Table.show","title":"tablite.core.Table.show(slice_=None, blanks=None, dtype=False) ","text":"prints ascii view of table. PARAMETER DESCRIPTION slice_ slice to determine table snippet. TYPE: slice DEFAULT: None blanks value for whitespace. Defaults to None. TYPE: str DEFAULT: None dtype adds subheader with datatype for column. Defaults to False. TYPE: bool DEFAULT: False Source code in tablite/base.py def show(self, slice_=None, blanks=None, dtype=False):\n \"\"\"prints ascii view of table.\n\n Args:\n slice_ (slice, optional): slice to determine table snippet.\n blanks (str, optional): value for whitespace. Defaults to None.\n dtype (bool, optional): adds subheader with datatype for column. Defaults to False.\n \"\"\"\n print(self.to_ascii(slice_=slice_, blanks=blanks, dtype=dtype))\n "},{"location":"reference/core/#tablite.core.Table.to_dict","title":"tablite.core.Table.to_dict(columns=None, slice_=None) ","text":"columns: list of column names. Default is None == all columns. slice_: slice. Default is None == all rows. returns: dict with columns as keys and lists of values. Example: >>> t.show()\n+===+===+===+\n| # | a | b |\n|row|int|int|\n+---+---+---+\n| 0 | 1| 3|\n| 1 | 2| 4|\n+===+===+===+\n>>> t.to_dict()\n{'a':[1,2], 'b':[3,4]}\n Source code in tablite/base.py def to_dict(self, columns=None, slice_=None):\n \"\"\"\n columns: list of column names. Default is None == all columns.\n slice_: slice. 
Default is None == all rows.\n\n returns: dict with columns as keys and lists of values.\n\n Example:\n ```\n >>> t.show()\n +===+===+===+\n | # | a | b |\n |row|int|int|\n +---+---+---+\n | 0 | 1| 3|\n | 1 | 2| 4|\n +===+===+===+\n >>> t.to_dict()\n {'a':[1,2], 'b':[3,4]}\n ```\n\n \"\"\"\n if slice_ is None:\n slice_ = slice(0, len(self))\n assert isinstance(slice_, slice)\n\n if columns is None:\n columns = list(self.columns.keys())\n if not isinstance(columns, list):\n raise TypeError(\"expected columns as list of strings\")\n\n return {name: list(self.columns[name][slice_]) for name in columns}\n "},{"location":"reference/core/#tablite.core.Table.as_json_serializable","title":"tablite.core.Table.as_json_serializable(row_count='row id', start_on=1, columns=None, slice_=None) ","text":"provides a JSON compatible format of the table. PARAMETER DESCRIPTION row_count Label for row counts. Defaults to \"row id\". TYPE: str DEFAULT: 'row id' start_on row counts starts by default on 1. TYPE: int DEFAULT: 1 columns Column names. Defaults to None which returns all columns. TYPE: list of str DEFAULT: None slice_ selector. Defaults to None which returns [:] TYPE: slice DEFAULT: None RETURNS DESCRIPTION JSON serializable dict: All python datatypes have been converted to JSON compliant data. Source code in tablite/base.py def as_json_serializable(\n self, row_count=\"row id\", start_on=1, columns=None, slice_=None\n):\n \"\"\"provides a JSON compatible format of the table.\n\n Args:\n row_count (str, optional): Label for row counts. Defaults to \"row id\".\n start_on (int, optional): row counts starts by default on 1.\n columns (list of str, optional): Column names.\n Defaults to None which returns all columns.\n slice_ (slice, optional): selector. Defaults to None which returns [:]\n\n Returns:\n JSON serializable dict: All python datatypes have been converted to JSON compliant data.\n \"\"\"\n if slice_ is None:\n slice_ = slice(0, len(self))\n\n assert isinstance(slice_, slice)\n new = {\"columns\": {}, \"total_rows\": len(self)}\n if row_count is not None:\n new[\"columns\"][row_count] = [\n i + start_on for i in range(*slice_.indices(len(self)))\n ]\n\n d = self.to_dict(columns, slice_=slice_)\n for k, data in d.items():\n new_k = unique_name(\n k, new[\"columns\"]\n ) # used to avoid overwriting the `row id` key.\n new[\"columns\"][new_k] = [\n DataTypes.to_json(v) for v in data\n ] # deal with non-json datatypes.\n return new\n "},{"location":"reference/core/#tablite.core.Table.index","title":"tablite.core.Table.index(*args) ","text":"param: *args: column names returns multikey index on the columns as d[(key tuple, )] = {index1, index2, ...} Examples: >>> table6 = Table()\n>>> table6['A'] = ['Alice', 'Bob', 'Bob', 'Ben', 'Charlie', 'Ben','Albert']\n>>> table6['B'] = ['Alison', 'Marley', 'Dylan', 'Affleck', 'Hepburn', 'Barnes', 'Einstein']\n >>> table6.index('A') # single key.\n{('Alice',): [0],\n ('Bob',): [1, 2],\n ('Ben',): [3, 5],\n ('Charlie',): [4],\n ('Albert',): [6]})\n >>> table6.index('A', 'B') # multiple keys.\n{('Alice', 'Alison'): [0],\n ('Bob', 'Marley'): [1],\n ('Bob', 'Dylan'): [2],\n ('Ben', 'Affleck'): [3],\n ('Charlie', 'Hepburn'): [4],\n ('Ben', 'Barnes'): [5],\n ('Albert', 'Einstein'): [6]})\n Source code in tablite/base.py def index(self, *args):\n \"\"\"\n param: *args: column names\n returns multikey index on the columns as d[(key tuple, )] = {index1, index2, ...}\n\n Examples:\n ```\n >>> table6 = Table()\n >>> table6['A'] = ['Alice', 'Bob', 'Bob', 'Ben', 'Charlie', 
'Ben','Albert']\n >>> table6['B'] = ['Alison', 'Marley', 'Dylan', 'Affleck', 'Hepburn', 'Barnes', 'Einstein']\n ```\n\n ```\n >>> table6.index('A') # single key.\n {('Alice',): [0],\n ('Bob',): [1, 2],\n ('Ben',): [3, 5],\n ('Charlie',): [4],\n ('Albert',): [6]})\n ```\n\n ```\n >>> table6.index('A', 'B') # multiple keys.\n {('Alice', 'Alison'): [0],\n ('Bob', 'Marley'): [1],\n ('Bob', 'Dylan'): [2],\n ('Ben', 'Affleck'): [3],\n ('Charlie', 'Hepburn'): [4],\n ('Ben', 'Barnes'): [5],\n ('Albert', 'Einstein'): [6]})\n ```\n\n \"\"\"\n idx = defaultdict(list)\n iterators = [iter(self.columns[c]) for c in args]\n for ix, key in enumerate(zip(*iterators)):\n key = tuple(numpy_to_python(k) for k in key)\n idx[key].append(ix)\n return idx\n "},{"location":"reference/core/#tablite.core.Table.unique_index","title":"tablite.core.Table.unique_index(*args, tqdm=_tqdm) ","text":"generates the index of unique rows given a list of column names PARAMETER DESCRIPTION *args columns names TYPE: any DEFAULT: () tqdm Defaults to _tqdm. TYPE: tqdm DEFAULT: tqdm RETURNS DESCRIPTION np.array(int64): indices of unique records. Source code in tablite/base.py def unique_index(self, *args, tqdm=_tqdm):\n \"\"\"generates the index of unique rows given a list of column names\n\n Args:\n *args (any): columns names\n tqdm (tqdm, optional): Defaults to _tqdm.\n\n Returns:\n np.array(int64): indices of unique records.\n \"\"\"\n if not args:\n raise ValueError(\"*args (column names) is required\")\n seen = set()\n unique = set()\n iterators = [iter(self.columns[c]) for c in args]\n for ix, key in tqdm(enumerate(zip(*iterators)), disable=Config.TQDM_DISABLE):\n key_hash = hash(tuple(numpy_to_python(k) for k in key))\n if key_hash in seen:\n continue\n else:\n seen.add(key_hash)\n unique.add(ix)\n return np.array(sorted(unique))\n "},{"location":"reference/core/#tablite.core.Table.from_file","title":"tablite.core.Table.from_file(path, columns=None, first_row_has_headers=True, header_row_index=0, encoding=None, start=0, limit=sys.maxsize, sheet=None, guess_datatypes=True, newline='\\n', text_qualifier=None, delimiter=None, strip_leading_and_tailing_whitespace=True, text_escape_openings='', text_escape_closures='', skip_empty: ValidSkipEmpty = 'NONE', tqdm=_tqdm) -> Table classmethod ","text":" reads path and imports 1 or more tables\n\n REQUIRED\n --------\n path: pathlib.Path or str\n selection of filereader uses path.suffix.\n See `filereaders`.\n\n OPTIONAL\n --------\n columns:\n None: (default) All columns will be imported.\n List: only column names from list will be imported (if present in file)\n e.g. ['A', 'B', 'C', 'D']\n\n datatype is detected using Datatypes.guess(...)\n You can try it out with:\n >> from tablite.datatypes import DataTypes\n >> DataTypes.guess(['001','100'])\n [1,100]\n\n if the format cannot be achieved the read type is kept.\n Excess column names are ignored.\n\n HINT: To get the head of file use:\n >>> from tablite.tools import head\n >>> head = head(path)\n\n first_row_has_headers: boolean\n True: (default) first row is used as column names.\n False: integers are used as column names.\n\n encoding: str. Defaults to None (autodetect using n bytes).\n n is declared in filereader_utils as ENCODING_GUESS_BYTES\n\n start: the first line to be read (default: 0)\n\n limit: the number of lines to be read from start (default sys.maxint ~ 2**63)\n\n OPTIONAL FOR EXCEL AND ODS READERS\n ----------------------------------\n\n sheet: sheet name to import (applicable to excel- and ods-reader only)\n e.g. 
'sheet_1'\n sheet names that are not found are ignored.\n\n OPTIONAL FOR TEXT READERS\n -------------------------\n guess_datatype: bool\n True: (default) datatypes are guessed using DataTypes.guess(...)\n False: all data is imported as strings.\n\n newline: newline character (applicable to text_reader only)\n str: '\\n' (default) or '\\r\\n'\n\n text_qualifier: character (applicable to text_reader only)\n None: No text qualifier is used.\n str: \" or '\n\n delimiter: character (applicable to text_reader only)\n None: file suffix is used to determine field delimiter:\n .txt: \"|\"\n .csv: \",\",\n .ssv: \";\"\n .tsv: \"\\t\" (tab)\n\n strip_leading_and_tailing_whitespace: bool:\n True: default\n\n text_escape_openings: (applicable to text_reader only)\n None: default\n str: list of characters such as ([{\n\n text_escape_closures: (applicable to text_reader only)\n None: default\n str: list of characters such as }])\n Source code in tablite/core.py @classmethod\ndef from_file(\n cls,\n path,\n columns=None,\n first_row_has_headers=True,\n header_row_index=0,\n encoding=None,\n start=0,\n limit=sys.maxsize,\n sheet=None,\n guess_datatypes=True,\n newline=\"\\n\",\n text_qualifier=None,\n delimiter=None,\n strip_leading_and_tailing_whitespace=True,\n text_escape_openings=\"\",\n text_escape_closures=\"\",\n skip_empty: ValidSkipEmpty=\"NONE\",\n tqdm=_tqdm,\n) -> \"Table\":\n \"\"\"\n reads path and imports 1 or more tables\n\n REQUIRED\n --------\n path: pathlib.Path or str\n selection of filereader uses path.suffix.\n See `filereaders`.\n\n OPTIONAL\n --------\n columns:\n None: (default) All columns will be imported.\n List: only column names from list will be imported (if present in file)\n e.g. ['A', 'B', 'C', 'D']\n\n datatype is detected using Datatypes.guess(...)\n You can try it out with:\n >> from tablite.datatypes import DataTypes\n >> DataTypes.guess(['001','100'])\n [1,100]\n\n if the format cannot be achieved the read type is kept.\n Excess column names are ignored.\n\n HINT: To get the head of file use:\n >>> from tablite.tools import head\n >>> head = head(path)\n\n first_row_has_headers: boolean\n True: (default) first row is used as column names.\n False: integers are used as column names.\n\n encoding: str. Defaults to None (autodetect using n bytes).\n n is declared in filereader_utils as ENCODING_GUESS_BYTES\n\n start: the first line to be read (default: 0)\n\n limit: the number of lines to be read from start (default sys.maxint ~ 2**63)\n\n OPTIONAL FOR EXCEL AND ODS READERS\n ----------------------------------\n\n sheet: sheet name to import (applicable to excel- and ods-reader only)\n e.g. 
'sheet_1'\n sheets not found excess names are ignored.\n\n OPTIONAL FOR TEXT READERS\n -------------------------\n guess_datatype: bool\n True: (default) datatypes are guessed using DataTypes.guess(...)\n False: all data is imported as strings.\n\n newline: newline character (applicable to text_reader only)\n str: '\\n' (default) or '\\r\\n'\n\n text_qualifier: character (applicable to text_reader only)\n None: No text qualifier is used.\n str: \" or '\n\n delimiter: character (applicable to text_reader only)\n None: file suffix is used to determine field delimiter:\n .txt: \"|\"\n .csv: \",\",\n .ssv: \";\"\n .tsv: \"\\t\" (tab)\n\n strip_leading_and_tailing_whitespace: bool:\n True: default\n\n text_escape_openings: (applicable to text_reader only)\n None: default\n str: list of characters such as ([{\n\n text_escape_closures: (applicable to text_reader only)\n None: default\n str: list of characters such as }])\n\n \"\"\"\n if isinstance(path, str):\n path = Path(path)\n type_check(path, Path)\n\n if not path.exists():\n raise FileNotFoundError(f\"file not found: {path}\")\n\n if not isinstance(start, int) or not 0 <= start <= sys.maxsize:\n raise ValueError(f\"start {start} not in range(0,{sys.maxsize})\")\n\n if not isinstance(limit, int) or not 0 < limit <= sys.maxsize:\n raise ValueError(f\"limit {limit} not in range(0,{sys.maxsize})\")\n\n if not isinstance(first_row_has_headers, bool):\n raise TypeError(\"first_row_has_headers is not bool\")\n\n import_as = path.suffix\n if import_as.startswith(\".\"):\n import_as = import_as[1:]\n\n reader = import_utils.file_readers.get(import_as, None)\n if reader is None:\n raise ValueError(f\"{import_as} is not in supported format: {import_utils.valid_readers}\")\n\n additional_configs = {\"tqdm\": tqdm}\n if reader == import_utils.text_reader:\n # here we inject tqdm, if tqdm is not provided, use generic iterator\n # fmt:off\n config = (path, columns, first_row_has_headers, header_row_index, encoding, start, limit, newline,\n guess_datatypes, text_qualifier, strip_leading_and_tailing_whitespace, skip_empty,\n delimiter, text_escape_openings, text_escape_closures)\n # fmt:on\n\n elif reader == import_utils.from_html:\n config = (path,)\n elif reader == import_utils.from_hdf5:\n config = (path,)\n\n elif reader == import_utils.excel_reader:\n # config = path, first_row_has_headers, sheet, columns, start, limit\n config = (\n path,\n first_row_has_headers,\n header_row_index,\n sheet,\n columns,\n skip_empty,\n start,\n limit,\n ) # if file length changes - re-import.\n\n if reader == import_utils.ods_reader:\n # path, first_row_has_headers=True, sheet=None, columns=None, start=0, limit=sys.maxsize,\n config = (\n str(path),\n first_row_has_headers,\n header_row_index,\n sheet,\n columns,\n skip_empty,\n start,\n limit,\n ) # if file length changes - re-import.\n\n # At this point the import config seems valid.\n # Now we check if the file already has been imported.\n\n # publish the settings\n return reader(cls, *config, **additional_configs)\n "},{"location":"reference/core/#tablite.core.Table.from_pandas","title":"tablite.core.Table.from_pandas(df) classmethod ","text":"Creates Table using pd.to_dict('list') similar to: >>> import pandas as pd\n>>> df = pd.DataFrame({'a':[1,2,3], 'b':[4,5,6]})\n>>> df\n a b\n 0 1 4\n 1 2 5\n 2 3 6\n>>> df.to_dict('list')\n{'a': [1, 2, 3], 'b': [4, 5, 6]}\n>>> t = Table.from_dict(df.to_dict('list))\n>>> t.show()\n +===+===+===+\n | # | a | b |\n |row|int|int|\n +---+---+---+\n | 0 | 1| 4|\n | 1 | 2| 5|\n | 2 
| 3| 6|\n +===+===+===+\n Source code in tablite/core.py @classmethod\ndef from_pandas(cls, df):\n \"\"\"\n Creates Table using pd.to_dict('list')\n\n similar to:\n ```\n >>> import pandas as pd\n >>> df = pd.DataFrame({'a':[1,2,3], 'b':[4,5,6]})\n >>> df\n a b\n 0 1 4\n 1 2 5\n 2 3 6\n >>> df.to_dict('list')\n {'a': [1, 2, 3], 'b': [4, 5, 6]}\n >>> t = Table.from_dict(df.to_dict('list'))\n >>> t.show()\n +===+===+===+\n | # | a | b |\n |row|int|int|\n +---+---+---+\n | 0 | 1| 4|\n | 1 | 2| 5|\n | 2 | 3| 6|\n +===+===+===+\n ```\n \"\"\"\n return import_utils.from_pandas(cls, df)\n "},{"location":"reference/core/#tablite.core.Table.from_hdf5","title":"tablite.core.Table.from_hdf5(path) classmethod ","text":"imports an exported hdf5 table. Source code in tablite/core.py @classmethod\ndef from_hdf5(cls, path):\n \"\"\"\n imports an exported hdf5 table.\n \"\"\"\n return import_utils.from_hdf5(cls, path)\n "},{"location":"reference/core/#tablite.core.Table.from_json","title":"tablite.core.Table.from_json(jsn) classmethod ","text":"Imports table exported using .to_json Source code in tablite/core.py @classmethod\ndef from_json(cls, jsn):\n \"\"\"\n Imports table exported using .to_json\n \"\"\"\n return import_utils.from_json(cls, jsn)\n "},{"location":"reference/core/#tablite.core.Table.to_hdf5","title":"tablite.core.Table.to_hdf5(path) ","text":"creates a copy of the table as hdf5 Source code in tablite/core.py def to_hdf5(self, path):\n \"\"\"\n creates a copy of the table as hdf5\n \"\"\"\n export_utils.to_hdf5(self, path)\n "},{"location":"reference/core/#tablite.core.Table.to_pandas","title":"tablite.core.Table.to_pandas() ","text":"returns pandas.DataFrame Source code in tablite/core.py def to_pandas(self):\n \"\"\"\n returns pandas.DataFrame\n \"\"\"\n return export_utils.to_pandas(self)\n "},{"location":"reference/core/#tablite.core.Table.to_sql","title":"tablite.core.Table.to_sql(name) ","text":"generates ANSI-92 compliant SQL. 
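A minimal usage sketch (added here, not part of the quoted docstrings): per the signature and source above, to_sql returns the generated SQL as a string, so it can be written to a file or handed to a database driver. The table contents and file name are illustrative.

```python
from tablite import Table

t = Table()
t['a'] = [1, 2, 3]
t['b'] = [4, 5, 6]

sql = t.to_sql('my_table')         # 'my_table' is a hypothetical target table name
with open('my_table.sql', 'w') as fo:
    fo.write(sql)                  # ANSI-92 compliant SQL, per the docstring
```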
Source code in tablite/core.py def to_sql(self, name):\n \"\"\"\n generates ANSI-92 compliant SQL.\n \"\"\"\n return export_utils.to_sql(self, name) # remove after update to test suite.\n "},{"location":"reference/core/#tablite.core.Table.to_json","title":"tablite.core.Table.to_json() ","text":"returns JSON Source code in tablite/core.py def to_json(self):\n \"\"\"\n returns JSON\n \"\"\"\n return export_utils.to_json(self)\n "},{"location":"reference/core/#tablite.core.Table.to_xlsx","title":"tablite.core.Table.to_xlsx(path) ","text":"exports table to path Source code in tablite/core.py def to_xlsx(self, path):\n \"\"\"\n exports table to path\n \"\"\"\n export_utils.path_suffix_check(path, \".xlsx\")\n export_utils.excel_writer(self, path)\n "},{"location":"reference/core/#tablite.core.Table.to_ods","title":"tablite.core.Table.to_ods(path) ","text":"exports table to path Source code in tablite/core.py def to_ods(self, path):\n \"\"\"\n exports table to path\n \"\"\"\n export_utils.path_suffix_check(path, \".ods\")\n export_utils.excel_writer(self, path)\n "},{"location":"reference/core/#tablite.core.Table.to_csv","title":"tablite.core.Table.to_csv(path) ","text":"exports table to path Source code in tablite/core.py def to_csv(self, path):\n \"\"\"\n exports table to path\n \"\"\"\n export_utils.path_suffix_check(path, \".csv\")\n export_utils.text_writer(self, path)\n "},{"location":"reference/core/#tablite.core.Table.to_tsv","title":"tablite.core.Table.to_tsv(path) ","text":"exports table to path Source code in tablite/core.py def to_tsv(self, path):\n \"\"\"\n exports table to path\n \"\"\"\n export_utils.path_suffix_check(path, \".tsv\")\n export_utils.text_writer(self, path)\n "},{"location":"reference/core/#tablite.core.Table.to_text","title":"tablite.core.Table.to_text(path) ","text":"exports table to path Source code in tablite/core.py def to_text(self, path):\n \"\"\"\n exports table to path\n \"\"\"\n export_utils.path_suffix_check(path, \".txt\")\n export_utils.text_writer(self, path)\n "},{"location":"reference/core/#tablite.core.Table.to_html","title":"tablite.core.Table.to_html(path) ","text":"exports table to path Source code in tablite/core.py def to_html(self, path):\n \"\"\"\n exports table to path\n \"\"\"\n export_utils.path_suffix_check(path, \".html\")\n export_utils.to_html(self, path)\n "},{"location":"reference/core/#tablite.core.Table.expression","title":"tablite.core.Table.expression(expression) ","text":"filters based on an expression, such as: \"all((A==B, C!=4, 200<D))\"\n which is interpreted using python's compiler to: def _f(A,B,C,D):\n return all((A==B, C!=4, 200<D))\n Source code in tablite/core.py def expression(self, expression):\n \"\"\"\n filters based on an expression, such as:\n\n \"all((A==B, C!=4, 200<D))\"\n\n which is interpreted using python's compiler to:\n\n def _f(A,B,C,D):\n return all((A==B, C!=4, 200<D))\n \"\"\"\n return redux._filter_using_expression(self, expression)\n "},{"location":"reference/core/#tablite.core.Table.filter","title":"tablite.core.Table.filter(expressions, filter_type='all', tqdm=_tqdm) ","text":"enables filtering across columns for multiple criteria. 
expressions: str: Expression that can be compiled and executed row by row.\n example: \"all((A==B, C!=4, 200<D))\"\n\nlist of dicts: (example):\n\n L = [\n {'column1':'A', 'criteria': \"==\", 'column2': 'B'},\n {'column1':'C', 'criteria': \"!=\", \"value2\": '4'},\n {'value1': 200, 'criteria': \"<\", 'column2': 'D' }\n ]\n\naccepted dictionary keys: 'column1', 'column2', 'criteria', 'value1', 'value2'\n filter_type: 'all' or 'any' Source code in tablite/core.py def filter(self, expressions, filter_type=\"all\", tqdm=_tqdm):\n \"\"\"\n enables filtering across columns for multiple criteria.\n\n expressions:\n\n str: Expression that can be compiled and executed row by row.\n example: \"all((A==B, C!=4, 200<D))\"\n\n list of dicts: (example):\n\n L = [\n {'column1':'A', 'criteria': \"==\", 'column2': 'B'},\n {'column1':'C', 'criteria': \"!=\", \"value2\": '4'},\n {'value1': 200, 'criteria': \"<\", 'column2': 'D' }\n ]\n\n accepted dictionary keys: 'column1', 'column2', 'criteria', 'value1', 'value2'\n\n filter_type: 'all' or 'any'\n \"\"\"\n return redux.filter(self, expressions, filter_type, tqdm)\n "},{"location":"reference/core/#tablite.core.Table.sort_index","title":"tablite.core.Table.sort_index(sort_mode='excel', tqdm=_tqdm, pbar=None, **kwargs) ","text":"helper for methods sort and is_sorted param: sort_mode: str: \"alphanumeric\", \"unix\", or, \"excel\" (default) param: **kwargs: sort criteria. See Table.sort() Source code in tablite/core.py def sort_index(self, sort_mode=\"excel\", tqdm=_tqdm, pbar=None, **kwargs):\n \"\"\"\n helper for methods `sort` and `is_sorted`\n\n param: sort_mode: str: \"alphanumeric\", \"unix\", or, \"excel\" (default)\n param: **kwargs: sort criteria. See Table.sort()\n \"\"\"\n return sortation.sort_index(self, sort_mode, tqdm=tqdm, pbar=pbar, **kwargs)\n "},{"location":"reference/core/#tablite.core.Table.reindex","title":"tablite.core.Table.reindex(index) ","text":"index: list of integers that declare sort order. Examples: Table: ['a','b','c','d','e','f','g','h']\nindex: [0,2,4,6]\nresult: ['a','c','e','g']\n\nTable: ['a','b','c','d','e','f','g','h']\nindex: [0,2,4,6,1,3,5,7]\nresult: ['a','c','e','g','b','d','f','h']\n Source code in tablite/core.py def reindex(self, index):\n \"\"\"\n index: list of integers that declare sort order.\n\n Examples:\n\n Table: ['a','b','c','d','e','f','g','h']\n index: [0,2,4,6]\n result: ['a','c','e','g']\n\n Table: ['a','b','c','d','e','f','g','h']\n index: [0,2,4,6,1,3,5,7]\n result: ['a','c','e','g','b','d','f','h']\n\n \"\"\"\n if isinstance(index, list):\n index = np.array(index)\n return _reindex.reindex(self, index)\n "},{"location":"reference/core/#tablite.core.Table.drop_duplicates","title":"tablite.core.Table.drop_duplicates(*args) ","text":"removes duplicate rows based on column names args: (optional) column_names if no args, all columns are used. Source code in tablite/core.py def drop_duplicates(self, *args):\n \"\"\"\n removes duplicate rows based on column names\n\n args: (optional) column_names\n if no args, all columns are used.\n \"\"\"\n if not args:\n args = self.columns\n index = self.unique_index(*args)\n return self.reindex(index)\n "},{"location":"reference/core/#tablite.core.Table.sort","title":"tablite.core.Table.sort(mapping, sort_mode='excel', tqdm=_tqdm, pbar: _tqdm = None) ","text":"Perform multi-pass sorting with precedence given by the order of column names. 
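A hedged usage sketch of filter with the list-of-dicts form documented above; the data and column names are illustrative, and the shape of the returned result follows the compress_both call in the source shown earlier (it splits rows by the computed mask).

```python
from tablite import Table

t = Table()
t['A'] = [1, 2, 3, 4]
t['B'] = [1, 2, 30, 40]

# keep rows where A == B AND A < 3
result = t.filter(
    [
        {'column1': 'A', 'criteria': '==', 'column2': 'B'},
        {'column1': 'A', 'criteria': '<', 'value2': 3},
    ],
    filter_type='all',
)
# per the source above, rows are split via compress_both into the
# rows that satisfy the criteria and the rows that do not.
```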
PARAMETER DESCRIPTION mapping keys as columns, values as boolean for 'reverse' TYPE: dict sort_mode str: \"alphanumeric\", \"unix\", or, \"excel\" DEFAULT: 'excel' RETURNS DESCRIPTION None the table is sorted in-place Examples: Table.sort(mapping={'A':False}) means sort by 'A' in ascending order. Table.sort(mapping={'A':True, 'B':False}) means sort 'A' in descending order, then (2nd priority) sort B in ascending order. Source code in tablite/core.py def sort(self, mapping, sort_mode=\"excel\", tqdm=_tqdm, pbar: _tqdm = None):\n \"\"\"Perform multi-pass sorting with precedence given by the order of column names.\n\n Args:\n mapping (dict): keys as columns,\n values as boolean for 'reverse'\n sort_mode: str: \"alphanumeric\", \"unix\", or, \"excel\"\n\n Returns:\n None: the table is sorted in-place\n\n Examples:\n Table.sort(mapping={'A':False}) means sort by 'A' in ascending order.\n Table.sort(mapping={'A':True, 'B':False}) means sort 'A' in descending order, then (2nd priority)\n sort B in ascending order.\n \"\"\"\n new = sortation.sort(self, mapping, sort_mode, tqdm=tqdm, pbar=pbar)\n self.columns = new.columns\n "},{"location":"reference/core/#tablite.core.Table.sorted","title":"tablite.core.Table.sorted(mapping, sort_mode='excel', tqdm=_tqdm, pbar: _tqdm = None) ","text":"See sort. Sorted returns a new table in contrast to \"sort\", which is in-place. RETURNS DESCRIPTION Table. Source code in tablite/core.py def sorted(self, mapping, sort_mode=\"excel\", tqdm=_tqdm, pbar: _tqdm = None):\n \"\"\"See sort.\n Sorted returns a new table in contrast to \"sort\", which is in-place.\n\n Returns:\n Table.\n \"\"\"\n return sortation.sort(self, mapping, sort_mode, tqdm=tqdm, pbar=pbar)\n "},{"location":"reference/core/#tablite.core.Table.is_sorted","title":"tablite.core.Table.is_sorted(mapping, sort_mode='excel') ","text":"Performs multi-pass sorting check with precedence given by the order of column names. **kwargs: optional: sort criteria. 
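A short sketch contrasting sort (in-place) with sorted (returns a new table), using the mapping convention from the docstrings above; the data is illustrative.

```python
from tablite import Table

t = Table()
t['A'] = [3, 1, 2]
t['B'] = ['x', 'y', 'z']

t.sort(mapping={'A': False})          # ascending by 'A'; sorts t in-place
new = t.sorted(mapping={'A': True})   # descending by 'A'; t is unchanged
assert t.is_sorted(mapping={'A': False})
```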
See Table.sort()\n :return bool\n \"\"\"\n return sortation.is_sorted(self, mapping, sort_mode)\n "},{"location":"reference/core/#tablite.core.Table.any","title":"tablite.core.Table.any(**kwargs) ","text":"returns Table for rows where ANY kwargs match :param kwargs: dictionary with headers and values / boolean callable Source code in tablite/core.py def any(self, **kwargs):\n \"\"\"\n returns Table for rows where ANY kwargs match\n :param kwargs: dictionary with headers and values / boolean callable\n \"\"\"\n return redux.filter_any(self, **kwargs)\n "},{"location":"reference/core/#tablite.core.Table.all","title":"tablite.core.Table.all(**kwargs) ","text":"returns Table for rows where ALL kwargs match :param kwargs: dictionary with headers and values / boolean callable Examples: t = Table()\nt['a'] = [1,2,3,4]\nt['b'] = [10,20,30,40]\n\ndef f(x):\n return x == 4\ndef g(x):\n return x < 20\n\nt2 = t.any( **{\"a\":f, \"b\":g})\nassert [r for r in t2.rows] == [[1, 10], [4, 40]]\n\nt2 = t.any(a=f,b=g)\nassert [r for r in t2.rows] == [[1, 10], [4, 40]]\n\ndef h(x):\n return x>=2\n\ndef i(x):\n return x<=30\n\nt2 = t.all(a=h,b=i)\nassert [r for r in t2.rows] == [[2,20], [3, 30]]\n Source code in tablite/core.py def all(self, **kwargs):\n \"\"\"\n returns Table for rows where ALL kwargs match\n :param kwargs: dictionary with headers and values / boolean callable\n\n Examples:\n\n t = Table()\n t['a'] = [1,2,3,4]\n t['b'] = [10,20,30,40]\n\n def f(x):\n return x == 4\n def g(x):\n return x < 20\n\n t2 = t.any( **{\"a\":f, \"b\":g})\n assert [r for r in t2.rows] == [[1, 10], [4, 40]]\n\n t2 = t.any(a=f,b=g)\n assert [r for r in t2.rows] == [[1, 10], [4, 40]]\n\n def h(x):\n return x>=2\n\n def i(x):\n return x<=30\n\n t2 = t.all(a=h,b=i)\n assert [r for r in t2.rows] == [[2,20], [3, 30]]\n\n\n \"\"\"\n return redux.filter_all(self, **kwargs)\n "},{"location":"reference/core/#tablite.core.Table.drop","title":"tablite.core.Table.drop(*args) ","text":"removes all rows where args are present. Example: t = Table() t['A'] = [1,2,3,None] t['B'] = [None,2,3,4] t2 = t.drop(None) t2['A'][:], t2['B'][:] ([2,3], [2,3]) Source code in tablite/core.py def drop(self, *args):\n \"\"\"\n removes all rows where args are present.\n\n Example:\n >>> t = Table()\n >>> t['A'] = [1,2,3,None]\n >>> t['B'] = [None,2,3,4]\n >>> t2 = t.drop(None)\n >>> t2['A'][:], t2['B'][:]\n ([2,3], [2,3])\n\n \"\"\"\n if not args:\n raise ValueError(\"What to drop? None? np.nan? \")\n return redux.drop(self, *args)\n "},{"location":"reference/core/#tablite.core.Table.replace","title":"tablite.core.Table.replace(mapping, columns=None, tqdm=_tqdm, pbar=None) ","text":"replaces all mapped keys with values from named columns PARAMETER DESCRIPTION mapping keys are targets for replacement, values are replacements. TYPE: dict columns target columns. 
Defaults to None (all columns) TYPE: list or str DEFAULT: None RAISES DESCRIPTION ValueError if a column name is not found. Source code in tablite/core.py def replace(self, mapping, columns=None, tqdm=_tqdm, pbar=None):\n \"\"\"replaces all mapped keys with values from named columns\n\n Args:\n mapping (dict): keys are targets for replacement,\n values are replacements.\n columns (list or str, optional): target columns.\n Defaults to None (all columns)\n\n Raises:\n ValueError: if a column name is not found.\n \"\"\"\n if columns is None:\n columns = list(self.columns)\n if not isinstance(columns, list) and columns in self.columns:\n columns = [columns]\n type_check(columns, list)\n for n in columns:\n if n not in self.columns:\n raise ValueError(f\"column not found: {n}\")\n\n if pbar is None:\n total = len(columns)\n pbar = tqdm(total=total, desc=\"replace\", disable=Config.TQDM_DISABLE)\n\n for name in columns:\n col = self.columns[name]\n col.replace(mapping)\n pbar.update(1)\n "},{"location":"reference/core/#tablite.core.Table.groupby","title":"tablite.core.Table.groupby(keys, functions, tqdm=_tqdm, pbar=None) ","text":"keys: column names for grouping. functions: [optional] list of column names and group functions (See GroupBy class) returns: table Example: t = Table()\nt.add_column('A', data=[1, 1, 2, 2, 3, 3] * 2)\nt.add_column('B', data=[1, 2, 3, 4, 5, 6] * 2)\nt.add_column('C', data=[6, 5, 4, 3, 2, 1] * 2)\n\nt.show()\n+=====+=====+=====+\n| A | B | C |\n| int | int | int |\n+-----+-----+-----+\n| 1| 1| 6|\n| 1| 2| 5|\n| 2| 3| 4|\n| 2| 4| 3|\n| 3| 5| 2|\n| 3| 6| 1|\n| 1| 1| 6|\n| 1| 2| 5|\n| 2| 3| 4|\n| 2| 4| 3|\n| 3| 5| 2|\n| 3| 6| 1|\n+=====+=====+=====+\n\ng = t.groupby(keys=['A', 'C'], functions=[('B', gb.sum)])\ng.show()\n+===+===+===+======+\n| # | A | C |Sum(B)|\n|row|int|int| int |\n+---+---+---+------+\n|0 | 1| 6| 2|\n|1 | 1| 5| 4|\n|2 | 2| 4| 6|\n|3 | 2| 3| 8|\n|4 | 3| 2| 10|\n|5 | 3| 1| 12|\n+===+===+===+======+\n Cheat sheet: list of unique values >>> g1 = t.groupby(keys=['A'], functions=[])\n>>> g1['A'][:]\n[1,2,3]\n alternatively: t['A'].unique() [1,2,3] list of unique values, grouped by longest combination. >>> g2 = t.groupby(keys=['A', 'B'], functions=[])\n>>> g2['A'][:], g2['B'][:]\n([1,1,2,2,3,3], [1,2,3,4,5,6])\n alternatively: >>> list(zip(*t.index('A', 'B').keys()))\n[(1,1,2,2,3,3) (1,2,3,4,5,6)]\n A key (unique values) and count hereof. 
>>> g3 = t.groupby(keys=['A'], functions=[('A', gb.count)])\n>>> g3['A'][:], g3['Count(A)'][:]\n([1,2,3], [4,4,4])\n alternatively: >>> t['A'].histogram()\n([1,2,3], [4,4,4])\n for more examples see: https://github.com/root-11/tablite/blob/master/tests/test_groupby.py Source code in tablite/core.py def groupby(self, keys, functions, tqdm=_tqdm, pbar=None):\n \"\"\"\n keys: column names for grouping.\n functions: [optional] list of column names and group functions (See GroupBy class)\n returns: table\n\n Example:\n ```\n t = Table()\n t.add_column('A', data=[1, 1, 2, 2, 3, 3] * 2)\n t.add_column('B', data=[1, 2, 3, 4, 5, 6] * 2)\n t.add_column('C', data=[6, 5, 4, 3, 2, 1] * 2)\n\n t.show()\n +=====+=====+=====+\n | A | B | C |\n | int | int | int |\n +-----+-----+-----+\n | 1| 1| 6|\n | 1| 2| 5|\n | 2| 3| 4|\n | 2| 4| 3|\n | 3| 5| 2|\n | 3| 6| 1|\n | 1| 1| 6|\n | 1| 2| 5|\n | 2| 3| 4|\n | 2| 4| 3|\n | 3| 5| 2|\n | 3| 6| 1|\n +=====+=====+=====+\n\n g = t.groupby(keys=['A', 'C'], functions=[('B', gb.sum)])\n g.show()\n +===+===+===+======+\n | # | A | C |Sum(B)|\n |row|int|int| int |\n +---+---+---+------+\n |0 | 1| 6| 2|\n |1 | 1| 5| 4|\n |2 | 2| 4| 6|\n |3 | 2| 3| 8|\n |4 | 3| 2| 10|\n |5 | 3| 1| 12|\n +===+===+===+======+\n ```\n Cheat sheet:\n\n list of unique values\n ```\n >>> g1 = t.groupby(keys=['A'], functions=[])\n >>> g1['A'][:]\n [1,2,3]\n ```\n alternatively:\n >>> t['A'].unique()\n [1,2,3]\n\n list of unique values, grouped by longest combination.\n ```\n >>> g2 = t.groupby(keys=['A', 'B'], functions=[])\n >>> g2['A'][:], g2['B'][:]\n ([1,1,2,2,3,3], [1,2,3,4,5,6])\n ```\n alternatively:\n ```\n >>> list(zip(*t.index('A', 'B').keys()))\n [(1,1,2,2,3,3) (1,2,3,4,5,6)]\n ```\n A key (unique values) and count hereof.\n ```\n >>> g3 = t.groupby(keys=['A'], functions=[('A', gb.count)])\n >>> g3['A'][:], g3['Count(A)'][:]\n ([1,2,3], [4,4,4])\n ```\n alternatively:\n ```\n >>> t['A'].histogram()\n ([1,2,3], [4,4,4])\n ```\n for more examples see:\n https://github.com/root-11/tablite/blob/master/tests/test_groupby.py\n\n \"\"\"\n return _groupby(self, keys, functions, tqdm)\n "},{"location":"reference/core/#tablite.core.Table.pivot","title":"tablite.core.Table.pivot(rows, columns, functions, values_as_rows=True, tqdm=_tqdm, pbar=None) ","text":"param: rows: column names to keep as rows param: columns: column names to keep as columns param: functions: aggregation functions from the GroupBy class as example: t.show()\n+=====+=====+=====+\n| A | B | C |\n| int | int | int |\n+-----+-----+-----+\n| 1| 1| 6|\n| 1| 2| 5|\n| 2| 3| 4|\n| 2| 4| 3|\n| 3| 5| 2|\n| 3| 6| 1|\n| 1| 1| 6|\n| 1| 2| 5|\n| 2| 3| 4|\n| 2| 4| 3|\n| 3| 5| 2|\n| 3| 6| 1|\n+=====+=====+=====+\n\nt2 = t.pivot(rows=['C'], columns=['A'], functions=[('B', gb.sum)])\nt2.show()\n+===+===+========+=====+=====+=====+\n| # | C |function|(A=1)|(A=2)|(A=3)|\n|row|int| str |mixed|mixed|mixed|\n+---+---+--------+-----+-----+-----+\n|0 | 6|Sum(B) | 2|None |None |\n|1 | 5|Sum(B) | 4|None |None |\n|2 | 4|Sum(B) |None | 6|None |\n|3 | 3|Sum(B) |None | 8|None |\n|4 | 2|Sum(B) |None |None | 10|\n|5 | 1|Sum(B) |None |None | 12|\n+===+===+========+=====+=====+=====+\n Source code in tablite/core.py def pivot(self, rows, columns, functions, values_as_rows=True, tqdm=_tqdm, pbar=None):\n \"\"\"\n param: rows: column names to keep as rows\n param: columns: column names to keep as columns\n param: functions: aggregation functions from the GroupBy class as\n\n example:\n ```\n t.show()\n +=====+=====+=====+\n | A | B | C |\n | int | int | int |\n 
+-----+-----+-----+\n | 1| 1| 6|\n | 1| 2| 5|\n | 2| 3| 4|\n | 2| 4| 3|\n | 3| 5| 2|\n | 3| 6| 1|\n | 1| 1| 6|\n | 1| 2| 5|\n | 2| 3| 4|\n | 2| 4| 3|\n | 3| 5| 2|\n | 3| 6| 1|\n +=====+=====+=====+\n\n t2 = t.pivot(rows=['C'], columns=['A'], functions=[('B', gb.sum)])\n t2.show()\n +===+===+========+=====+=====+=====+\n | # | C |function|(A=1)|(A=2)|(A=3)|\n |row|int| str |mixed|mixed|mixed|\n +---+---+--------+-----+-----+-----+\n |0 | 6|Sum(B) | 2|None |None |\n |1 | 5|Sum(B) | 4|None |None |\n |2 | 4|Sum(B) |None | 6|None |\n |3 | 3|Sum(B) |None | 8|None |\n |4 | 2|Sum(B) |None |None | 10|\n |5 | 1|Sum(B) |None |None | 12|\n +===+===+========+=====+=====+=====+\n ```\n \"\"\"\n return pivots.pivot(self, rows, columns, functions, values_as_rows, tqdm=tqdm, pbar=pbar)\n "},{"location":"reference/core/#tablite.core.Table.merge","title":"tablite.core.Table.merge(left, right, new, criteria) ","text":"takes from LEFT where criteria is True else RIGHT. :param: T: Table :param: criteria: np.array(bool): if True take left column else take right column :param left: (str) column name :param right: (str) column name :param new: (str) new name :returns: T Example: >>> c.show()\n+==+====+====+====+====+\n| #| A | B | C | D |\n+--+----+----+----+----+\n| 0| 1| 10| 1| 11|\n| 1| 2| 20| 2| 12|\n| 2| 3|None| 3| 13|\n| 3|None| 40|None|None|\n| 4| 5| 50|None|None|\n| 5|None|None| 6| 16|\n| 6|None|None| 7| 17|\n+==+====+====+====+====+\n\n>>> c.merge(\"A\", \"C\", new=\"E\", criteria=[v != None for v in c['A']])\n>>> c.show()\n+==+====+====+====+\n| #| B | D | E |\n+--+----+----+----+\n| 0| 10| 11| 1|\n| 1| 20| 12| 2|\n| 2|None| 13| 3|\n| 3| 40|None|None|\n| 4| 50|None| 5|\n| 5|None| 16| 6|\n| 6|None| 17| 7|\n+==+====+====+====+\n Source code in tablite/core.py def merge(self, left, right, new, criteria):\n \"\"\" takes from LEFT where criteria is True else RIGHT.\n :param: T: Table\n :param: criteria: np.array(bool): \n if True take left column\n else take right column\n :param left: (str) column name\n :param right: (str) column name\n :param new: (str) new name\n\n :returns: T\n\n Example:\n ```\n >>> c.show()\n +==+====+====+====+====+\n | #| A | B | C | D |\n +--+----+----+----+----+\n | 0| 1| 10| 1| 11|\n | 1| 2| 20| 2| 12|\n | 2| 3|None| 3| 13|\n | 3|None| 40|None|None|\n | 4| 5| 50|None|None|\n | 5|None|None| 6| 16|\n | 6|None|None| 7| 17|\n +==+====+====+====+====+\n\n >>> c.merge(\"A\", \"C\", new=\"E\", criteria=[v != None for v in c['A']])\n >>> c.show()\n +==+====+====+====+\n | #| B | D | E |\n +--+----+----+----+\n | 0| 10| 11| 1|\n | 1| 20| 12| 2|\n | 2|None| 13| 3|\n | 3| 40|None|None|\n | 4| 50|None| 5|\n | 5|None| 16| 6|\n | 6|None| 17| 7|\n +==+====+====+====+\n ```\n \"\"\"\n return merge.where(self, criteria,left,right,new)\n "},{"location":"reference/core/#tablite.core.Table.column_select","title":"tablite.core.Table.column_select(cols: list[ColumnSelectorDict], tqdm=_tqdm, TaskManager=_TaskManager) ","text":"type-casts columns from a given table to specified type(s) cols list of dicts: (example): cols = [\n {'column':'A', 'type': 'bool'},\n {'column':'B', 'type': 'int', 'allow_empty': True},\n {'column':'B', 'type': 'float', 'allow_empty': False, 'rename': 'C'},\n]\n 'column' : column name of the input table that we want to type-cast 'type' : type that we want to type-cast the specified column to 'allow_empty': should we allow empty values (None, str('')) through (Default: False) 'rename' : new name of the column, if None will keep the original name, in case of duplicates suffix will 
be added (Default: None) supported types: 'bool', 'int', 'float', 'str', 'date', 'time', 'datetime' if any of the columns is rejected, entire row is rejected tqdm: progressbar constructor TaskManager: TaskManager constructor (TABLE, TABLE) DESCRIPTION first table contains the rows that were successfully cast to desired types second table contains rows that failed to cast + rejection reason Source code in tablite/core.py def column_select(self, cols: list[ColumnSelectorDict], tqdm=_tqdm, TaskManager=_TaskManager):\n \"\"\"\n type-casts columns from a given table to specified type(s)\n\n cols:\n list of dicts: (example):\n\n cols = [\n {'column':'A', 'type': 'bool'},\n {'column':'B', 'type': 'int', 'allow_empty': True},\n {'column':'B', 'type': 'float', 'allow_empty': False, 'rename': 'C'},\n ]\n\n 'column' : column name of the input table that we want to type-cast\n 'type' : type that we want to type-cast the specified column to\n 'allow_empty': should we allow empty values (None, str('')) through (Default: False)\n 'rename' : new name of the column, if None will keep the original name, in case of duplicates suffix will be added (Default: None)\n\n supported types: 'bool', 'int', 'float', 'str', 'date', 'time', 'datetime'\n\n if any of the columns is rejected, entire row is rejected\n\n tqdm: progressbar constructor\n TaskManager: TaskManager constructor\n\n returns: (Table, Table)\n first table contains the rows that were successfully cast to desired types\n second table contains rows that failed to cast + rejection reason\n \"\"\"\n return _column_select(self, cols, tqdm, TaskManager)\n "},{"location":"reference/core/#tablite.core.Table.join","title":"tablite.core.Table.join(other, left_keys, right_keys, left_columns=None, right_columns=None, kind='inner', merge_keys=False, tqdm=_tqdm, pbar=None) ","text":"short-cut for all join functions. kind: 'inner', 'left', 'outer', 'cross' Source code in tablite/core.py def join(self, other, left_keys, right_keys, left_columns=None, right_columns=None, kind=\"inner\", merge_keys=False, tqdm=_tqdm, pbar=None):\n \"\"\"\n short-cut for all join functions.\n kind: 'inner', 'left', 'outer', 'cross'\n \"\"\"\n kinds = {\n \"inner\": self.inner_join,\n \"left\": self.left_join,\n \"outer\": self.outer_join,\n \"cross\": self.cross_join,\n }\n if kind not in kinds:\n raise ValueError(f\"join type unknown: {kind}\")\n f = kinds.get(kind, None)\n return f(other, left_keys, right_keys, left_columns, right_columns, merge_keys=merge_keys, tqdm=tqdm, pbar=pbar)\n "},{"location":"reference/core/#tablite.core.Table.left_join","title":"tablite.core.Table.left_join(other, left_keys, right_keys, left_columns=None, right_columns=None, merge_keys=False, tqdm=_tqdm, pbar=None) ","text":":param other: self, other = (left, right) :param left_keys: list of keys for the join :param right_keys: list of keys for the join :param left_columns: list of left columns to retain, if None, all are retained. :param right_columns: list of right columns to retain, if None, all are retained. 
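A usage sketch of column_select based on the cols schema above; per the docstring it returns two tables, the rows that cast successfully and the rejected rows. The values are illustrative; exact cast rules follow tablite's type system.

```python
from tablite import Table

t = Table()
t['A'] = ['1', '0', 'x']       # 'x' is expected to fail the bool cast
t['B'] = ['1.5', '', '2.0']

passed, failed = t.column_select(
    cols=[
        {'column': 'A', 'type': 'bool'},
        {'column': 'B', 'type': 'float', 'allow_empty': True, 'rename': 'C'},
    ]
)
# 'failed' carries the rejected rows plus the rejection reason.
```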
:return: new Table Example: SQL: SELECT number, letter FROM numbers LEFT JOIN letters ON numbers.colour == letters.color\nTablite: left_join = numbers.left_join(\n letters, left_keys=['colour'], right_keys=['color'], left_columns=['number'], right_columns=['letter']\n)\n Source code in tablite/core.py def left_join(self, other, left_keys, right_keys, left_columns=None, right_columns=None, merge_keys=False, tqdm=_tqdm, pbar=None):\n \"\"\"\n :param other: self, other = (left, right)\n :param left_keys: list of keys for the join\n :param right_keys: list of keys for the join\n :param left_columns: list of left columns to retain, if None, all are retained.\n :param right_columns: list of right columns to retain, if None, all are retained.\n :return: new Table\n Example:\n ```\n SQL: SELECT number, letter FROM numbers LEFT JOIN letters ON numbers.colour == letters.color\n Tablite: left_join = numbers.left_join(\n letters, left_keys=['colour'], right_keys=['color'], left_columns=['number'], right_columns=['letter']\n )\n ```\n \"\"\"\n return joins.left_join(self, other, left_keys, right_keys, left_columns, right_columns, merge_keys=merge_keys, tqdm=tqdm, pbar=pbar)\n "},{"location":"reference/core/#tablite.core.Table.inner_join","title":"tablite.core.Table.inner_join(other, left_keys, right_keys, left_columns=None, right_columns=None, merge_keys=False, tqdm=_tqdm, pbar=None) ","text":":param other: self, other = (left, right) :param left_keys: list of keys for the join :param right_keys: list of keys for the join :param left_columns: list of left columns to retain, if None, all are retained. :param right_columns: list of right columns to retain, if None, all are retained. :return: new Table Example: SQL: SELECT number, letter FROM numbers JOIN letters ON numbers.colour == letters.color\nTablite: inner_join = numbers.inner_join(\n letters, left_keys=['colour'], right_keys=['color'], left_columns=['number'], right_columns=['letter']\n )\n Source code in tablite/core.py def inner_join(self, other, left_keys, right_keys, left_columns=None, right_columns=None, merge_keys=False, tqdm=_tqdm, pbar=None):\n \"\"\"\n :param other: self, other = (left, right)\n :param left_keys: list of keys for the join\n :param right_keys: list of keys for the join\n :param left_columns: list of left columns to retain, if None, all are retained.\n :param right_columns: list of right columns to retain, if None, all are retained.\n :return: new Table\n Example:\n ```\n SQL: SELECT number, letter FROM numbers JOIN letters ON numbers.colour == letters.color\n Tablite: inner_join = numbers.inner_join(\n letters, left_keys=['colour'], right_keys=['color'], left_columns=['number'], right_columns=['letter']\n )\n ```\n \"\"\"\n return joins.inner_join(self, other, left_keys, right_keys, left_columns, right_columns, merge_keys=merge_keys, tqdm=tqdm, pbar=pbar)\n "},{"location":"reference/core/#tablite.core.Table.outer_join","title":"tablite.core.Table.outer_join(other, left_keys, right_keys, left_columns=None, right_columns=None, merge_keys=False, tqdm=_tqdm, pbar=None) ","text":":param other: self, other = (left, right) :param left_keys: list of keys for the join :param right_keys: list of keys for the join :param left_columns: list of left columns to retain, if None, all are retained. :param right_columns: list of right columns to retain, if None, all are retained. 
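The join shortcut shown above dispatches on kind; a sketch mirroring the SQL examples from these docstrings, with illustrative data:

```python
from tablite import Table

numbers, letters = Table(), Table()
numbers['number'] = [1, 2, 3]
numbers['colour'] = ['red', 'blue', 'red']
letters['letter'] = ['a', 'b']
letters['color'] = ['red', 'blue']

result = numbers.join(
    letters,
    left_keys=['colour'], right_keys=['color'],
    left_columns=['number'], right_columns=['letter'],
    kind='left',               # one of 'inner', 'left', 'outer', 'cross'
)
```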
:return: new Table Example: SQL: SELECT number, letter FROM numbers OUTER JOIN letters ON numbers.colour == letters.color\nTablite: outer_join = numbers.outer_join(\n letters, left_keys=['colour'], right_keys=['color'], left_columns=['number'], right_columns=['letter']\n )\n Source code in tablite/core.py def outer_join(self, other, left_keys, right_keys, left_columns=None, right_columns=None, merge_keys=False, tqdm=_tqdm, pbar=None):\n \"\"\"\n :param other: self, other = (left, right)\n :param left_keys: list of keys for the join\n :param right_keys: list of keys for the join\n :param left_columns: list of left columns to retain, if None, all are retained.\n :param right_columns: list of right columns to retain, if None, all are retained.\n :return: new Table\n Example:\n ```\n SQL: SELECT number, letter FROM numbers OUTER JOIN letters ON numbers.colour == letters.color\n Tablite: outer_join = numbers.outer_join(\n letters, left_keys=['colour'], right_keys=['color'], left_columns=['number'], right_columns=['letter']\n )\n ```\n \"\"\"\n return joins.outer_join(self, other, left_keys, right_keys, left_columns, right_columns, merge_keys=merge_keys, tqdm=tqdm, pbar=pbar)\n "},{"location":"reference/core/#tablite.core.Table.cross_join","title":"tablite.core.Table.cross_join(other, left_keys, right_keys, left_columns=None, right_columns=None, merge_keys=False, tqdm=_tqdm, pbar=None) ","text":"CROSS JOIN returns the Cartesian product of rows from tables in the join. In other words, it will produce rows which combine each row from the first table with each row from the second table Source code in tablite/core.py def cross_join(self, other, left_keys, right_keys, left_columns=None, right_columns=None, merge_keys=False, tqdm=_tqdm, pbar=None):\n \"\"\"\n CROSS JOIN returns the Cartesian product of rows from tables in the join.\n In other words, it will produce rows which combine each row from the first table\n with each row from the second table\n \"\"\"\n return joins.cross_join(self, other, left_keys, right_keys, left_columns, right_columns, merge_keys=merge_keys, tqdm=tqdm, pbar=pbar)\n "},{"location":"reference/core/#tablite.core.Table.lookup","title":"tablite.core.Table.lookup(other, *criteria, all=True, tqdm=_tqdm) ","text":"function for looking up values in other according to criteria in ascending order. :param: other: Table sorted in ascending search order. :param: criteria: Each criteria must be a tuple with value comparisons in the form: (LEFT, OPERATOR, RIGHT) :param: all: boolean: True=ALL, False=Any OPERATOR must be a callable that returns a boolean LEFT must be a value that the OPERATOR can compare. RIGHT must be a value that the OPERATOR can compare. 
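A sketch of cross_join producing the Cartesian product described above; the key arguments mirror the other join signatures, and the data is illustrative:

```python
from tablite import Table

sizes, colours = Table(), Table()
sizes['size'] = ['S', 'M', 'L']
colours['colour'] = ['red', 'blue']

cartesian = sizes.cross_join(
    colours, left_keys=['size'], right_keys=['colour']
)
assert len(cartesian) == 6     # 3 x 2 combined rows
```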
Examples: ('column A', \"==\", 'column B') # comparison of two columns\n('Date', \"<\", DataTypes.date(24,12) ) # value from column 'Date' is before 24/12.\nf = lambda L,R: all( ord(L) < ord(R) ) # uses custom function.\n('text 1', f, 'text 2') value from column 'text 1' is compared with value from column 'text 2'\n Source code in tablite/core.py def lookup(self, other, *criteria, all=True, tqdm=_tqdm):\n \"\"\"function for looking up values in `other` according to criteria in ascending order.\n :param: other: Table sorted in ascending search order.\n :param: criteria: Each criteria must be a tuple with value comparisons in the form:\n (LEFT, OPERATOR, RIGHT)\n :param: all: boolean: True=ALL, False=Any\n\n OPERATOR must be a callable that returns a boolean\n LEFT must be a value that the OPERATOR can compare.\n RIGHT must be a value that the OPERATOR can compare.\n\n Examples:\n ```\n ('column A', \"==\", 'column B') # comparison of two columns\n ('Date', \"<\", DataTypes.date(24,12) ) # value from column 'Date' is before 24/12.\n f = lambda L,R: all( ord(L) < ord(R) ) # uses custom function.\n ('text 1', f, 'text 2') value from column 'text 1' is compared with value from column 'text 2'\n ```\n \"\"\"\n return lookup.lookup(self, other, *criteria, all=all, tqdm=tqdm)\n "},{"location":"reference/core/#tablite.core.Table.match","title":"tablite.core.Table.match(other, *criteria, keep_left=None, keep_right=None) ","text":"performs inner join where T matches other and removes rows that do not match. :param: T: Table :param: other: Table :param: criteria: Each criteria must be a tuple with value comparisons in the form: (LEFT, OPERATOR, RIGHT), where operator must be \"==\"\n\nExample:\n ('column A', \"==\", 'column B')\n\nThis syntax follows the lookup syntax. See Lookup for details.\n :param: keep_left: list of columns to keep. :param: keep_right: list of right columns to keep. Source code in tablite/core.py def match(self, other, *criteria, keep_left=None, keep_right=None):\n \"\"\"\n performs inner join where `T` matches `other` and removes rows that do not match.\n\n :param: T: Table\n :param: other: Table\n :param: criteria: Each criteria must be a tuple with value comparisons in the form:\n\n (LEFT, OPERATOR, RIGHT), where operator must be \"==\"\n\n Example:\n ('column A', \"==\", 'column B')\n\n This syntax follows the lookup syntax. See Lookup for details.\n\n :param: keep_left: list of columns to keep.\n :param: keep_right: list of right columns to keep.\n \"\"\"\n return match.match(self, other, *criteria, keep_left=keep_left, keep_right=keep_right)\n "},{"location":"reference/core/#tablite.core.Table.replace_missing_values","title":"tablite.core.Table.replace_missing_values(*args, **kwargs) ","text":"Source code in tablite/core.py def replace_missing_values(self, *args, **kwargs):\n raise AttributeError(\"See imputation\")\n "},{"location":"reference/core/#tablite.core.Table.imputation","title":"tablite.core.Table.imputation(targets, missing=None, method='carry forward', sources=None, tqdm=_tqdm) ","text":"In statistics, imputation is the process of replacing missing data with substituted values. See more: https://en.wikipedia.org/wiki/Imputation_(statistics) PARAMETER DESCRIPTION table source table. TYPE: Table targets column names to find and replace missing values TYPE: str or list of strings missing values to be replaced. TYPE: None or iterable DEFAULT: None method method to be used for replacement. 
Options: 'carry forward': takes the previous value, and carries forward into fields where values are missing. +: quick. Realistic on time series. -: Can produce strange outliers. 'mean': calculates the column mean (exclude missing ) and copies the mean in as replacement. +: quick -: doesn't work on text. Causes data set to drift towards the mean. 'mode': calculates the column mode (exclude missing ) and copies the mode in as replacement. +: quick -: most frequent value becomes over-represented in the sample 'nearest neighbour': calculates normalised distance between items in source columns selects nearest neighbour and copies value as replacement. +: works for any datatype. -: computationally intensive (e.g. slow) TYPE: str DEFAULT: 'carry forward' sources NEAREST NEIGHBOUR ONLY column names to be used during imputation. if None or empty, all columns will be used. TYPE: list of strings DEFAULT: None RETURNS DESCRIPTION table table with replaced values. Source code in tablite/core.py def imputation(self, targets, missing=None, method=\"carry forward\", sources=None, tqdm=_tqdm):\n \"\"\"\n In statistics, imputation is the process of replacing missing data with substituted values.\n\n See more: https://en.wikipedia.org/wiki/Imputation_(statistics)\n\n Args:\n table (Table): source table.\n\n targets (str or list of strings): column names to find and\n replace missing values\n\n missing (None or iterable): values to be replaced.\n\n method (str): method to be used for replacement. Options:\n\n 'carry forward':\n takes the previous value, and carries forward into fields\n where values are missing.\n +: quick. Realistic on time series.\n -: Can produce strange outliers.\n\n 'mean':\n calculates the column mean (exclude `missing`) and copies\n the mean in as replacement.\n +: quick\n -: doesn't work on text. Causes data set to drift towards the mean.\n\n 'mode':\n calculates the column mode (exclude `missing`) and copies\n the mode in as replacement.\n +: quick\n -: most frequent value becomes over-represented in the sample\n\n 'nearest neighbour':\n calculates normalised distance between items in source columns\n selects nearest neighbour and copies value as replacement.\n +: works for any datatype.\n -: computationally intensive (e.g. slow)\n\n sources (list of strings): NEAREST NEIGHBOUR ONLY\n column names to be used during imputation.\n if None or empty, all columns will be used.\n\n Returns:\n table: table with replaced values.\n \"\"\"\n return imputation.imputation(self, targets, missing, method, sources, tqdm=tqdm)\n "},{"location":"reference/core/#tablite.core.Table.transpose","title":"tablite.core.Table.transpose(tqdm=_tqdm) ","text":"Source code in tablite/core.py def transpose(self, tqdm=_tqdm):\n return pivots.transpose(self, tqdm)\n "},{"location":"reference/core/#tablite.core.Table.pivot_transpose","title":"tablite.core.Table.pivot_transpose(columns, keep=None, column_name='transpose', value_name='value', tqdm=_tqdm) ","text":"Transpose a selection of columns to rows. PARAMETER DESCRIPTION columns column names to transpose TYPE: list of column names keep column names to keep (repeat) TYPE: list of column names DEFAULT: None RETURNS DESCRIPTION Table with columns transposed to rows Example keep columns 1, 2 and 3 and transpose the remaining columns, except sum . Input: | col1 | col2 | col3 | sun | mon | tue | ... | sat | sum |\n|------|------|------|-----|-----|-----|-----|-----|------|\n| 1234 | 2345 | 3456 | 456 | 567 | | ... | | 1023 |\n| 1244 | 2445 | 4456 | | 7 | | ... 
| | 7 |\n| ... | | | | | | | | |\n\nt.transpose(keep=[col1, col2, col3], transpose=[sun,mon,tue,wed,thu,fri,sat])`\n\nOutput:\n\n|col1| col2| col3| transpose| value|\n|----|-----|-----|----------|------|\n|1234| 2345| 3456| sun | 456|\n|1234| 2345| 3456| mon | 567|\n|1244| 2445| 4456| mon | 7|\n Source code in tablite/core.py def pivot_transpose(self, columns, keep=None, column_name=\"transpose\", value_name=\"value\", tqdm=_tqdm):\n \"\"\"Transpose a selection of columns to rows.\n\n Args:\n columns (list of column names): column names to transpose\n keep (list of column names): column names to keep (repeat)\n\n Returns:\n Table: with columns transposed to rows\n\n Example:\n keep columns 1, 2 and 3 and transpose the remaining columns, except `sum`.\n\n Input:\n ```\n | col1 | col2 | col3 | sun | mon | tue | ... | sat | sum |\n |------|------|------|-----|-----|-----|-----|-----|------|\n | 1234 | 2345 | 3456 | 456 | 567 | | ... | | 1023 |\n | 1244 | 2445 | 4456 | | 7 | | ... | | 7 |\n | ... | | | | | | | | |\n\n t.transpose(keep=[col1, col2, col3], transpose=[sun,mon,tue,wed,thu,fri,sat])`\n\n Output:\n\n |col1| col2| col3| transpose| value|\n |----|-----|-----|----------|------|\n |1234| 2345| 3456| sun | 456|\n |1234| 2345| 3456| mon | 567|\n |1244| 2445| 4456| mon | 7|\n ```\n \"\"\"\n return pivots.pivot_transpose(self, columns, keep, column_name, value_name, tqdm=tqdm)\n "},{"location":"reference/core/#tablite.core.Table.diff","title":"tablite.core.Table.diff(other, columns=None) ","text":"compares table self with table other PARAMETER DESCRIPTION self Table TYPE: Table other Table TYPE: Table columns list of column names to include in comparison. Defaults to None. TYPE: List DEFAULT: None RETURNS DESCRIPTION Table diff of self and other with diff in columns 1st and 2nd. Source code in tablite/core.py def diff(self, other, columns=None):\n \"\"\"compares table self with table other\n\n Args:\n self (Table): Table\n other (Table): Table\n columns (List, optional): list of column names to include in comparison. 
Defaults to None.\n\n Returns:\n Table: diff of self and other with diff in columns 1st and 2nd.\n \"\"\"\n return diff.diff(self, other, columns)\n "},{"location":"reference/core/#tablite.core-functions","title":"Functions","text":""},{"location":"reference/core/#tablite.core-modules","title":"Modules","text":""},{"location":"reference/datasets/","title":"Datasets","text":""},{"location":"reference/datasets/#tablite.datasets","title":"tablite.datasets ","text":""},{"location":"reference/datasets/#tablite.datasets-classes","title":"Classes","text":""},{"location":"reference/datasets/#tablite.datasets-functions","title":"Functions","text":""},{"location":"reference/datasets/#tablite.datasets.synthetic_order_data","title":"tablite.datasets.synthetic_order_data(rows=100000) ","text":"Creates a synthetic dataset for testing that looks like this: (depending on number of rows) +=========+=======+=============+===================+=====+===+=====+====+===+=====+=====+===================+==================+\n| ~ | # | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 |\n| row | int | int | datetime | int |int| int |str |str|mixed|mixed| float | float |\n+---------+-------+-------------+-------------------+-----+---+-----+----+---+-----+-----+-------------------+------------------+\n|0 | 1|1478158906743|2021-10-27 00:00:00|50764| 1|29990|C4-5|APP|21\u00b0 |None | 2.0434376837650046|1.3371665497020444|\n|1 | 2|2271295805011|2021-09-13 00:00:00|50141| 0|10212|C4-5|TAE|None |None | 1.010318612835485| 20.94821610676901|\n|2 | 3|1598726492913|2021-08-19 00:00:00|50527| 0|19416|C3-5|QPV|21\u00b0 |None | 1.463459515469516| 17.4133659842749|\n|3 | 4|1413615572689|2021-11-05 00:00:00|50181| 1|18637|C4-2|GCL|6\u00b0 |ABC | 2.084002469706324| 0.489481411683505|\n|4 | 5| 245266998048|2021-09-25 00:00:00|50378| 0|29756|C5-4|LGY|6\u00b0 |XYZ | 0.5141579343276079| 8.550780816571438|\n|5 | 6| 947994853644|2021-10-14 00:00:00|50511| 0| 7890|C2-4|BET|0\u00b0 |XYZ | 1.1725893606177542| 7.447314130260951|\n|6 | 7|2230693047809|2021-10-07 00:00:00|50987| 1|26742|C1-3|CFP|0\u00b0 |XYZ | 1.0921267279498004|11.009210185311993|\n|... |... |... |... |... |...|... |... |...|... |... |... |... |\n|7,999,993|7999994|2047223556745|2021-09-03 00:00:00|50883| 1|15687|C3-1|RFR|None |XYZ | 1.3467185981566827|17.023443485654845|\n|7,999,994|7999995|1814140654790|2021-08-02 00:00:00|50152| 0|16556|C4-2|WTC|None |ABC | 1.1517593924478968| 8.201818634721487|\n|7,999,995|7999996| 155308171103|2021-10-14 00:00:00|50008| 1|14590|C1-3|WYM|0\u00b0 |None | 2.1273836233717978|23.295943554889195|\n|7,999,996|7999997|1620451532911|2021-12-12 00:00:00|50173| 1|20744|C2-1|ZYO|6\u00b0 |ABC | 2.482509134693724| 22.25375464857266|\n|7,999,997|7999998|1248987682094|2021-12-20 00:00:00|50052| 1|28298|C5-4|XAW|None |XYZ |0.17923757926558143|23.728160892974252|\n|7,999,998|7999999|1382206732187|2021-11-13 00:00:00|50993| 1|24832|C5-2|UDL|None |ABC |0.08425329763360942|12.707735293126758|\n|7,999,999|8000000| 600688069780|2021-09-28 00:00:00|50510| 0|15819|C3-4|IGY|None |ABC | 1.066241687256579|13.862069804070295|\n+=========+=======+=============+===================+=====+===+=====+====+===+=====+=====+===================+==================+\n PARAMETER DESCRIPTION rows number of rows wanted. Defaults to 100_000. TYPE: int DEFAULT: 100000 RETURNS DESCRIPTION Table Populated table. 
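A usage sketch (added here): generating a small synthetic table for tests; the row count is illustrative.

```python
from tablite.datasets import synthetic_order_data

t = synthetic_order_data(rows=1_000)   # defaults to 100_000 rows
t.show()                               # prints a table like the one above
```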
TYPE: Table Source code in tablite/datasets.py def synthetic_order_data(rows=100_000):\n \"\"\"Creates a synthetic dataset for testing that looks like this:\n (depending on number of rows)\n\n ```\n +=========+=======+=============+===================+=====+===+=====+====+===+=====+=====+===================+==================+\n | ~ | # | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 |\n | row | int | int | datetime | int |int| int |str |str|mixed|mixed| float | float |\n +---------+-------+-------------+-------------------+-----+---+-----+----+---+-----+-----+-------------------+------------------+\n |0 | 1|1478158906743|2021-10-27 00:00:00|50764| 1|29990|C4-5|APP|21\u00b0 |None | 2.0434376837650046|1.3371665497020444|\n |1 | 2|2271295805011|2021-09-13 00:00:00|50141| 0|10212|C4-5|TAE|None |None | 1.010318612835485| 20.94821610676901|\n |2 | 3|1598726492913|2021-08-19 00:00:00|50527| 0|19416|C3-5|QPV|21\u00b0 |None | 1.463459515469516| 17.4133659842749|\n |3 | 4|1413615572689|2021-11-05 00:00:00|50181| 1|18637|C4-2|GCL|6\u00b0 |ABC | 2.084002469706324| 0.489481411683505|\n |4 | 5| 245266998048|2021-09-25 00:00:00|50378| 0|29756|C5-4|LGY|6\u00b0 |XYZ | 0.5141579343276079| 8.550780816571438|\n |5 | 6| 947994853644|2021-10-14 00:00:00|50511| 0| 7890|C2-4|BET|0\u00b0 |XYZ | 1.1725893606177542| 7.447314130260951|\n |6 | 7|2230693047809|2021-10-07 00:00:00|50987| 1|26742|C1-3|CFP|0\u00b0 |XYZ | 1.0921267279498004|11.009210185311993|\n |... |... |... |... |... |...|... |... |...|... |... |... |... |\n |7,999,993|7999994|2047223556745|2021-09-03 00:00:00|50883| 1|15687|C3-1|RFR|None |XYZ | 1.3467185981566827|17.023443485654845|\n |7,999,994|7999995|1814140654790|2021-08-02 00:00:00|50152| 0|16556|C4-2|WTC|None |ABC | 1.1517593924478968| 8.201818634721487|\n |7,999,995|7999996| 155308171103|2021-10-14 00:00:00|50008| 1|14590|C1-3|WYM|0\u00b0 |None | 2.1273836233717978|23.295943554889195|\n |7,999,996|7999997|1620451532911|2021-12-12 00:00:00|50173| 1|20744|C2-1|ZYO|6\u00b0 |ABC | 2.482509134693724| 22.25375464857266|\n |7,999,997|7999998|1248987682094|2021-12-20 00:00:00|50052| 1|28298|C5-4|XAW|None |XYZ |0.17923757926558143|23.728160892974252|\n |7,999,998|7999999|1382206732187|2021-11-13 00:00:00|50993| 1|24832|C5-2|UDL|None |ABC |0.08425329763360942|12.707735293126758|\n |7,999,999|8000000| 600688069780|2021-09-28 00:00:00|50510| 0|15819|C3-4|IGY|None |ABC | 1.066241687256579|13.862069804070295|\n +=========+=======+=============+===================+=====+===+=====+====+===+=====+=====+===================+==================+\n ```\n\n Args:\n rows (int, optional): number of rows wanted. 
Defaults to 100_000.\n\n Returns:\n Table (Table): Populated table.\n \"\"\" # noqa\n rows = int(rows)\n\n L1 = [\"None\", \"0\u00b0\", \"6\u00b0\", \"21\u00b0\"]\n L2 = [\"ABC\", \"XYZ\", \"\"]\n\n t = Table()\n assert isinstance(t, Table)\n for page_n in range(math.ceil(rows / Config.PAGE_SIZE)): # n pages\n start = (page_n * Config.PAGE_SIZE)\n end = min(start + Config.PAGE_SIZE, rows)\n ro = range(start, end)\n\n t2 = Table()\n t2[\"#\"] = [v+1 for v in ro]\n # 1 - mock orderid\n t2[\"1\"] = [random.randint(18_778_628_504, 2277_772_117_504) for i in ro]\n # 2 - mock delivery date.\n t2[\"2\"] = [datetime.fromordinal(random.randint(738000, 738150)).isoformat() for i in ro]\n # 3 - mock store id.\n t2[\"3\"] = [random.randint(50000, 51000) for _ in ro]\n # 4 - random bit.\n t2[\"4\"] = [random.randint(0, 1) for _ in ro]\n # 5 - mock product id\n t2[\"5\"] = [random.randint(3000, 30000) for _ in ro]\n # 6 - random weird string\n t2[\"6\"] = [f\"C{random.randint(1, 5)}-{random.randint(1, 5)}\" for _ in ro]\n # 7 - # random category\n t2[\"7\"] = [\"\".join(random.choice(ascii_uppercase) for _ in range(3)) for _ in ro]\n # 8 -random temperature group.\n t2[\"8\"] = [random.choice(L1) for _ in ro]\n # 9 - random choice of category\n t2[\"9\"] = [random.choice(L2) for _ in ro]\n # 10 - volume?\n t2[\"10\"] = [random.uniform(0.01, 2.5) for _ in ro]\n # 11 - units?\n t2[\"11\"] = [f\"{random.uniform(0.1, 25)}\" for _ in ro]\n\n if len(t) == 0:\n t = t2\n else:\n t += t2\n\n return t\n "},{"location":"reference/datatypes/","title":"Datatypes","text":""},{"location":"reference/datatypes/#tablite.datatypes","title":"tablite.datatypes ","text":""},{"location":"reference/datatypes/#tablite.datatypes-attributes","title":"Attributes","text":""},{"location":"reference/datatypes/#tablite.datatypes.matched_types","title":"tablite.datatypes.matched_types = {int: DataTypes._infer_int, str: DataTypes._infer_str, float: DataTypes._infer_float, bool: DataTypes._infer_bool, date: DataTypes._infer_date, datetime: DataTypes._infer_datetime, time: DataTypes._infer_time} module-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes-classes","title":"Classes","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes","title":"tablite.datatypes.DataTypes ","text":" Bases: object DataTypes is the conversion library for all datatypes. It supports any / all python datatypes. 
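A hedged sketch using two of the conversion helpers documented below (type_code and to_bytes); the value is illustrative, and the byte form follows the b_date helper (isoformat bytes) shown later in this section.

```python
from datetime import date
from tablite.datatypes import DataTypes

v = date(2024, 1, 1)
code = DataTypes.type_code(v)   # numeric type code registered for datetime.date
b = DataTypes.to_bytes(v)       # b'2024-01-01' via b_date (isoformat bytes)
```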
"},{"location":"reference/datatypes/#tablite.datatypes.DataTypes-attributes","title":"Attributes","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.int","title":"tablite.datatypes.DataTypes.int = int class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.str","title":"tablite.datatypes.DataTypes.str = str class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.float","title":"tablite.datatypes.DataTypes.float = float class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.bool","title":"tablite.datatypes.DataTypes.bool = bool class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.date","title":"tablite.datatypes.DataTypes.date = date class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.datetime","title":"tablite.datatypes.DataTypes.datetime = datetime class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.time","title":"tablite.datatypes.DataTypes.time = time class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.timedelta","title":"tablite.datatypes.DataTypes.timedelta = timedelta class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.numeric_types","title":"tablite.datatypes.DataTypes.numeric_types = {int, float, date, time, datetime} class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.epoch","title":"tablite.datatypes.DataTypes.epoch = datetime(2000, 1, 1, 0, 0, 0, 0, timezone.utc) class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.epoch_no_tz","title":"tablite.datatypes.DataTypes.epoch_no_tz = datetime(2000, 1, 1, 0, 0, 0, 0) class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.digits","title":"tablite.datatypes.DataTypes.digits = '1234567890' class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.decimals","title":"tablite.datatypes.DataTypes.decimals = set('1234567890-+eE.') class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.integers","title":"tablite.datatypes.DataTypes.integers = set('1234567890-+') class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.nones","title":"tablite.datatypes.DataTypes.nones = {'null', 'Null', 'NULL', '#N/A', '#n/a', '', 'None', None, np.nan} class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.none_type","title":"tablite.datatypes.DataTypes.none_type = type(None) class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.bytes_functions","title":"tablite.datatypes.DataTypes.bytes_functions = {type(None): b_none, bool: b_bool, int: b_int, float: b_float, str: b_str, bytes: b_bytes, datetime: b_datetime, date: b_date, time: b_time, timedelta: b_timedelta} class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.type_code_functions","title":"tablite.datatypes.DataTypes.type_code_functions = {1: _none, 2: 
_bool, 3: _int, 4: _float, 5: _str, 6: _bytes, 7: _datetime, 8: _date, 9: _time, 10: _timedelta, 11: _unpickle} class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.pytype_from_type_code","title":"tablite.datatypes.DataTypes.pytype_from_type_code = {1: type(None), 2: bool, 3: int, 4: float, 5: str, 6: bytes, 7: datetime, 8: date, 9: time, 10: timedelta, 11: 'pickled object'} class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.date_formats","title":"tablite.datatypes.DataTypes.date_formats = {'NNNN-NN-NN': lambda x: date(*int(i) for i in x.split('-')), 'NNNN-N-NN': lambda x: date(*int(i) for i in x.split('-')), 'NNNN-NN-N': lambda x: date(*int(i) for i in x.split('-')), 'NNNN-N-N': lambda x: date(*int(i) for i in x.split('-')), 'NN-NN-NNNN': lambda x: date(*[int(i) for i in x.split('-')][::-1]), 'N-NN-NNNN': lambda x: date(*[int(i) for i in x.split('-')][::-1]), 'NN-N-NNNN': lambda x: date(*[int(i) for i in x.split('-')][::-1]), 'N-N-NNNN': lambda x: date(*[int(i) for i in x.split('-')][::-1]), 'NNNN.NN.NN': lambda x: date(*int(i) for i in x.split('.')), 'NNNN.N.NN': lambda x: date(*int(i) for i in x.split('.')), 'NNNN.NN.N': lambda x: date(*int(i) for i in x.split('.')), 'NNNN.N.N': lambda x: date(*int(i) for i in x.split('.')), 'NN.NN.NNNN': lambda x: date(*[int(i) for i in x.split('.')][::-1]), 'N.NN.NNNN': lambda x: date(*[int(i) for i in x.split('.')][::-1]), 'NN.N.NNNN': lambda x: date(*[int(i) for i in x.split('.')][::-1]), 'N.N.NNNN': lambda x: date(*[int(i) for i in x.split('.')][::-1]), 'NNNN/NN/NN': lambda x: date(*int(i) for i in x.split('/')), 'NNNN/N/NN': lambda x: date(*int(i) for i in x.split('/')), 'NNNN/NN/N': lambda x: date(*int(i) for i in x.split('/')), 'NNNN/N/N': lambda x: date(*int(i) for i in x.split('/')), 'NN/NN/NNNN': lambda x: date(*[int(i) for i in x.split('/')][::-1]), 'N/NN/NNNN': lambda x: date(*[int(i) for i in x.split('/')][::-1]), 'NN/N/NNNN': lambda x: date(*[int(i) for i in x.split('/')][::-1]), 'N/N/NNNN': lambda x: date(*[int(i) for i in x.split('/')][::-1]), 'NNNN NN NN': lambda x: date(*int(i) for i in x.split(' ')), 'NNNN N NN': lambda x: date(*int(i) for i in x.split(' ')), 'NNNN NN N': lambda x: date(*int(i) for i in x.split(' ')), 'NNNN N N': lambda x: date(*int(i) for i in x.split(' ')), 'NN NN NNNN': lambda x: date(*[int(i) for i in x.split(' ')][::-1]), 'N N NNNN': lambda x: date(*[int(i) for i in x.split(' ')][::-1]), 'NN N NNNN': lambda x: date(*[int(i) for i in x.split(' ')][::-1]), 'N NN NNNN': lambda x: date(*[int(i) for i in x.split(' ')][::-1]), 'NNNNNNNN': lambda x: date(*(int(x[:4]), int(x[4:6]), int(x[6:])))} class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.datetime_formats","title":"tablite.datatypes.DataTypes.datetime_formats = {'NNNN-NN-NNTNN:NN:NN': lambda x: DataTypes.pattern_to_datetime(x), 'NNNN-NN-NNTNN:NN': lambda x: DataTypes.pattern_to_datetime(x), 'NNNN-NN-NN NN:NN:NN': lambda x: DataTypes.pattern_to_datetime(x, T=' '), 'NNNN-NN-NN NN:NN': lambda x: DataTypes.pattern_to_datetime(x, T=' '), 'NNNN/NN/NNTNN:NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='/'), 'NNNN/NN/NNTNN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='/'), 'NNNN/NN/NN NN:NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='/', T=' '), 'NNNN/NN/NN NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='/', T=' '), 'NNNN NN NNTNN:NN:NN': lambda x: 
DataTypes.pattern_to_datetime(x, ymd=' '), 'NNNN NN NNTNN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd=' '), 'NNNN NN NN NN:NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd=' ', T=' '), 'NNNN NN NN NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd=' ', T=' '), 'NNNN.NN.NNTNN:NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='.'), 'NNNN.NN.NNTNN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='.'), 'NNNN.NN.NN NN:NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='.', T=' '), 'NNNN.NN.NN NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='.', T=' '), 'NN-NN-NNNNTNN:NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='-', T=' ', day_first=True), 'NN-NN-NNNNTNN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='-', T=' ', day_first=True), 'NN-NN-NNNN NN:NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='-', T=' ', day_first=True), 'NN-NN-NNNN NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='-', T=' ', day_first=True), 'NN/NN/NNNNTNN:NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='/', day_first=True), 'NN/NN/NNNNTNN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='/', day_first=True), 'NN/NN/NNNN NN:NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='/', T=' ', day_first=True), 'NN/NN/NNNN NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='/', T=' ', day_first=True), 'NN NN NNNNTNN:NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='/', day_first=True), 'NN NN NNNNTNN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='/', day_first=True), 'NN NN NNNN NN:NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='/', day_first=True), 'NN NN NNNN NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='/', day_first=True), 'NN.NN.NNNNTNN:NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='.', day_first=True), 'NN.NN.NNNNTNN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='.', day_first=True), 'NN.NN.NNNN NN:NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='.', day_first=True), 'NN.NN.NNNN NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='.', day_first=True), 'NNNNNNNNTNNNNNN': lambda x: DataTypes.pattern_to_datetime(x, compact=1), 'NNNNNNNNTNNNN': lambda x: DataTypes.pattern_to_datetime(x, compact=1), 'NNNNNNNNTNN': lambda x: DataTypes.pattern_to_datetime(x, compact=1), 'NNNNNNNNNN': lambda x: DataTypes.pattern_to_datetime(x, compact=2), 'NNNNNNNNNNNN': lambda x: DataTypes.pattern_to_datetime(x, compact=2), 'NNNNNNNNNNNNNN': lambda x: DataTypes.pattern_to_datetime(x, compact=2), 'NNNNNNNNTNN:NN:NN': lambda x: DataTypes.pattern_to_datetime(x, compact=3)} class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.types","title":"tablite.datatypes.DataTypes.types = [datetime, date, time, int, bool, float, str] class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes-functions","title":"Functions","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.type_code","title":"tablite.datatypes.DataTypes.type_code(value) classmethod ","text":"Source code in tablite/datatypes.py @classmethod\ndef type_code(cls, value):\n if type(value) in cls._type_codes:\n return cls._type_codes[type(value)]\n elif hasattr(value, \"dtype\"):\n dtype = pytype(value)\n return cls._type_codes[dtype]\n else:\n return cls._type_codes[\"pickle\"]\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.b_none","title":"tablite.datatypes.DataTypes.b_none(v) ","text":"Source code in tablite/datatypes.py def 
b_none(v):\n return b\"None\"\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.b_bool","title":"tablite.datatypes.DataTypes.b_bool(v) ","text":"Source code in tablite/datatypes.py def b_bool(v):\n return bytes(str(v), encoding=\"utf-8\")\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.b_int","title":"tablite.datatypes.DataTypes.b_int(v) ","text":"Source code in tablite/datatypes.py def b_int(v):\n return bytes(str(v), encoding=\"utf-8\")\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.b_float","title":"tablite.datatypes.DataTypes.b_float(v) ","text":"Source code in tablite/datatypes.py def b_float(v):\n return bytes(str(v), encoding=\"utf-8\")\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.b_str","title":"tablite.datatypes.DataTypes.b_str(v) ","text":"Source code in tablite/datatypes.py def b_str(v):\n return v.encode(\"utf-8\")\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.b_bytes","title":"tablite.datatypes.DataTypes.b_bytes(v) ","text":"Source code in tablite/datatypes.py def b_bytes(v):\n return v\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.b_datetime","title":"tablite.datatypes.DataTypes.b_datetime(v) ","text":"Source code in tablite/datatypes.py def b_datetime(v):\n return bytes(v.isoformat(), encoding=\"utf-8\")\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.b_date","title":"tablite.datatypes.DataTypes.b_date(v) ","text":"Source code in tablite/datatypes.py def b_date(v):\n return bytes(v.isoformat(), encoding=\"utf-8\")\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.b_time","title":"tablite.datatypes.DataTypes.b_time(v) ","text":"Source code in tablite/datatypes.py def b_time(v):\n return bytes(v.isoformat(), encoding=\"utf-8\")\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.b_timedelta","title":"tablite.datatypes.DataTypes.b_timedelta(v) ","text":"Source code in tablite/datatypes.py def b_timedelta(v):\n return bytes(str(float(v.days + (v.seconds / (24 * 60 * 60)))), \"utf-8\")\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.b_pickle","title":"tablite.datatypes.DataTypes.b_pickle(v) ","text":"Source code in tablite/datatypes.py def b_pickle(v):\n return pickle.dumps(v, protocol=0)\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.to_bytes","title":"tablite.datatypes.DataTypes.to_bytes(v) classmethod ","text":"Source code in tablite/datatypes.py @classmethod\ndef to_bytes(cls, v):\n if type(v) in cls.bytes_functions: # it's a python native type\n f = cls.bytes_functions[type(v)]\n elif hasattr(v, \"dtype\"): # it's a numpy/c type.\n dtype = pytype(v)\n f = cls.bytes_functions[dtype]\n else:\n f = cls.b_pickle\n return f(v)\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.from_type_code","title":"tablite.datatypes.DataTypes.from_type_code(value, code) classmethod ","text":"Source code in tablite/datatypes.py @classmethod\ndef from_type_code(cls, value, code):\n f = cls.type_code_functions[code]\n return f(value)\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.pattern_to_datetime","title":"tablite.datatypes.DataTypes.pattern_to_datetime(iso_string, ymd=None, T=None, compact=0, day_first=False) staticmethod ","text":"Source code in tablite/datatypes.py @staticmethod\ndef pattern_to_datetime(iso_string, ymd=None, T=None, compact=0, day_first=False):\n assert isinstance(iso_string, str)\n if compact:\n s = iso_string\n if compact == 1: # 
has T\n slices = [\n (0, 4, \"-\"),\n (4, 6, \"-\"),\n (6, 8, \"T\"),\n (9, 11, \":\"),\n (11, 13, \":\"),\n (13, len(s), \"\"),\n ]\n elif compact == 2: # has no T.\n slices = [\n (0, 4, \"-\"),\n (4, 6, \"-\"),\n (6, 8, \"T\"),\n (8, 10, \":\"),\n (10, 12, \":\"),\n (12, len(s), \"\"),\n ]\n elif compact == 3: # has T and :\n slices = [\n (0, 4, \"-\"),\n (4, 6, \"-\"),\n (6, 8, \"T\"),\n (9, 11, \":\"),\n (12, 14, \":\"),\n (15, len(s), \"\"),\n ]\n else:\n raise TypeError\n iso_string = \"\".join([s[a:b] + c for a, b, c in slices if b <= len(s)])\n iso_string = iso_string.rstrip(\":\")\n\n if day_first:\n s = iso_string\n iso_string = \"\".join((s[6:10], \"-\", s[3:5], \"-\", s[0:2], s[10:]))\n\n if \",\" in iso_string:\n iso_string = iso_string.replace(\",\", \".\")\n\n dot = iso_string[::-1].find(\".\")\n if 0 < dot < 10:\n ix = len(iso_string) - dot\n microsecond = int(float(f\"0{iso_string[ix - 1:]}\") * 10**6)\n # fmt:off\n iso_string = iso_string[: len(iso_string) - dot] + str(microsecond).rjust(6, \"0\")\n # fmt:on\n if ymd:\n iso_string = iso_string.replace(ymd, \"-\", 2)\n if T:\n iso_string = iso_string.replace(T, \"T\")\n return datetime.fromisoformat(iso_string)\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.round","title":"tablite.datatypes.DataTypes.round(value, multiple, up=None) classmethod ","text":"a nicer way to round numbers. PARAMETER DESCRIPTION value value to be rounded TYPE: (float, integer, datetime) multiple value to be used as the basis of rounding. 1) multiple = 1 is the same as rounding to whole integers. 2) multiple = 0.001 is the same as rounding to 3 digits precision. 3) multiple = 3.1415 is rounding to the nearest multiple of 3.1415 4) value = datetime(2022,8,18,11,14,53,440) 5) multiple = timedelta(hours=0.5) 6) xround(value,multiple) is datetime(2022,8,18,11,0) TYPE: (float, integer, timedelta) up None (default) or boolean rounds half, up or down. round(1.6, 1) rounds to 2. round(1.4, 1) rounds to 1. round(1.5, 1, up=True) rounds to 2. round(1.5, 1, up=False) rounds to 1. TYPE: (None, bool) DEFAULT: None RETURNS DESCRIPTION float,integer,datetime: rounded value in same type as input. 
Source code in tablite/datatypes.py @classmethod\ndef round(cls, value, multiple, up=None):\n \"\"\"a nicer way to round numbers.\n\n Args:\n value (float,integer,datetime): value to be rounded\n\n multiple (float,integer,timedelta): value to be used as the basis of rounding.\n 1) multiple = 1 is the same as rounding to whole integers.\n 2) multiple = 0.001 is the same as rounding to 3 digits precision.\n 3) multiple = 3.1415 is rounding to the nearest multiple of 3.1415\n 4) value = datetime(2022,8,18,11,14,53,440)\n 5) multiple = timedelta(hours=0.5)\n 6) xround(value,multiple) is datetime(2022,8,18,11,0)\n\n up (None, bool, optional):\n None (default) or boolean rounds half, up or down.\n round(1.6, 1) rounds to 2.\n round(1.4, 1) rounds to 1.\n round(1.5, 1, up=True) rounds to 2.\n round(1.5, 1, up=False) rounds to 1.\n\n Returns:\n float,integer,datetime: rounded value in same type as input.\n \"\"\"\n epoch = 0\n if isinstance(value, (datetime)) and isinstance(multiple, timedelta):\n if value.tzinfo is None:\n epoch = cls.epoch_no_tz\n else:\n epoch = cls.epoch\n\n value2 = value - epoch\n if value2 == 0:\n return value2\n\n low = (value2 // multiple) * multiple\n high = low + multiple\n if up is True:\n return high + epoch\n elif up is False:\n return low + epoch\n else:\n if abs((high + epoch) - value) < abs(value - (low + epoch)):\n return high + epoch\n else:\n return low + epoch\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.to_json","title":"tablite.datatypes.DataTypes.to_json(v) staticmethod ","text":"converts any python type to json. PARAMETER DESCRIPTION v value to convert to json TYPE: any RETURNS DESCRIPTION json compatible value from v Source code in tablite/datatypes.py @staticmethod\ndef to_json(v):\n \"\"\"converts any python type to json.\n\n Args:\n v (any): value to convert to json\n\n Returns:\n json compatible value from v\n \"\"\"\n if hasattr(v, \"dtype\"):\n v = numpy_to_python(v)\n if v is None:\n return v\n elif v is False:\n # using isinstance(v, bool): won't work as False also is int of zero.\n return str(v)\n elif v is True:\n return str(v)\n elif isinstance(v, int):\n return v\n elif isinstance(v, str):\n return v\n elif isinstance(v, float):\n return v\n elif isinstance(v, datetime):\n return v.isoformat()\n elif isinstance(v, time):\n return v.isoformat()\n elif isinstance(v, date):\n return v.isoformat()\n elif isinstance(v, timedelta):\n return f\"P{v.days}DT{v.seconds + (v.microseconds / 1e6)}S\"\n else:\n raise TypeError(f\"The datatype {type(v)} is not supported.\")\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.from_json","title":"tablite.datatypes.DataTypes.from_json(v, dtype) staticmethod ","text":"converts json to python datatype PARAMETER DESCRIPTION v value TYPE: any dtype any python type TYPE: python type RETURNS DESCRIPTION python type of value v Source code in tablite/datatypes.py @staticmethod\ndef from_json(v, dtype):\n \"\"\"converts json to python datatype\n\n Args:\n v (any): value\n dtype (python type): any python type\n\n Returns:\n python type of value v\n \"\"\"\n if v in DataTypes.nones:\n if dtype is str and v == \"\":\n return \"\"\n else:\n return None\n if dtype is int:\n return int(v)\n elif dtype is str:\n return str(v)\n elif dtype is float:\n return float(v)\n elif dtype is bool:\n if v == \"False\":\n return False\n elif v == \"True\":\n return True\n else:\n raise ValueError(v)\n elif dtype is date:\n return date.fromisoformat(v)\n elif dtype is datetime:\n return 
datetime.fromisoformat(v)\n elif dtype is time:\n return time.fromisoformat(v)\n elif dtype is timedelta:\n L = v.split(\"DT\")\n days = int(L[0].lstrip(\"P\"))\n seconds = float(L[1].rstrip(\"S\"))\n return timedelta(days, seconds)\n else:\n raise TypeError(f\"The datatype {str(dtype)} is not supported.\")\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.guess_types","title":"tablite.datatypes.DataTypes.guess_types(*values) staticmethod ","text":"Attempts to guess the datatype for *values returns dict with matching datatypes and probabilities RETURNS DESCRIPTION dict {key: type, value: probability} Source code in tablite/datatypes.py @staticmethod\ndef guess_types(*values):\n \"\"\"Attempts to guess the datatype for *values\n returns dict with matching datatypes and probabilities\n\n Returns:\n dict: {key: type, value: probability}\n \"\"\"\n d = defaultdict(int)\n probability = Rank(DataTypes.types[:])\n\n for value in values:\n if hasattr(value, \"dtype\"):\n value = numpy_to_python(value)\n\n for dtype in probability:\n try:\n _ = DataTypes.infer(value, dtype)\n d[dtype] += 1\n probability.match(dtype)\n break\n except (ValueError, TypeError):\n pass\n if not d:\n d[str] = len(values)\n return {k: round(v / len(values), 3) for k, v in d.items()}\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.guess","title":"tablite.datatypes.DataTypes.guess(*values) staticmethod ","text":"Makes a best guess of the datatype for *values returns list of native python values RETURNS DESCRIPTION list list of native python values Source code in tablite/datatypes.py @staticmethod\ndef guess(*values):\n \"\"\"Makes a best guess of the datatype for *values\n returns list of native python values\n\n Returns:\n list: list of native python values\n \"\"\"\n probability = Rank(*DataTypes.types[:])\n matches = [None for _ in values[0]]\n\n for ix, value in enumerate(values[0]):\n if hasattr(value, \"dtype\"):\n value = numpy_to_python(value)\n for dtype in probability:\n try:\n matches[ix] = DataTypes.infer(value, dtype)\n probability.match(dtype)\n break\n except (ValueError, TypeError):\n pass\n return matches\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.infer","title":"tablite.datatypes.DataTypes.infer(v, dtype) classmethod ","text":"Source code in tablite/datatypes.py @classmethod\ndef infer(cls, v, dtype):\n if isinstance(v, str) and dtype == str:\n # we got a string, we're trying to infer it to string, we shouldn't check for None-ness\n return v\n\n if v in DataTypes.nones:\n return None\n\n if dtype not in matched_types:\n raise TypeError(f\"The datatype {str(dtype)} is not supported.\")\n\n return matched_types[dtype](v)\n "},{"location":"reference/datatypes/#tablite.datatypes.Rank","title":"tablite.datatypes.Rank(*items) ","text":" Bases: object Source code in tablite/datatypes.py def __init__(self, *items):\n self.items = {i: ix for i, ix in zip(items, range(len(items)))}\n self.ranks = [0 for _ in items]\n self.items_list = [i for i in items]\n "},{"location":"reference/datatypes/#tablite.datatypes.Rank-attributes","title":"Attributes","text":""},{"location":"reference/datatypes/#tablite.datatypes.Rank.items","title":"tablite.datatypes.Rank.items = {i: ix for (i, ix) in zip(items, range(len(items)))} instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.Rank.ranks","title":"tablite.datatypes.Rank.ranks = [0 for _ in items] instance-attribute 
","text":""},{"location":"reference/datatypes/#tablite.datatypes.Rank.items_list","title":"tablite.datatypes.Rank.items_list = [i for i in items] instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.Rank-functions","title":"Functions","text":""},{"location":"reference/datatypes/#tablite.datatypes.Rank.match","title":"tablite.datatypes.Rank.match(k) ","text":"Source code in tablite/datatypes.py def match(self, k): # k+=1\n ix = self.items[k]\n r = self.ranks\n r[ix] += 1\n\n if ix > 0:\n p = self.items_list\n while (\n r[ix] > r[ix - 1] and ix > 0\n ): # use a simple bubble sort to maintain rank\n r[ix], r[ix - 1] = r[ix - 1], r[ix]\n p[ix], p[ix - 1] = p[ix - 1], p[ix]\n old = p[ix]\n self.items[old] = ix\n self.items[k] = ix - 1\n ix -= 1\n "},{"location":"reference/datatypes/#tablite.datatypes.Rank.__iter__","title":"tablite.datatypes.Rank.__iter__() ","text":"Source code in tablite/datatypes.py def __iter__(self):\n return iter(self.items_list)\n "},{"location":"reference/datatypes/#tablite.datatypes.MetaArray","title":"tablite.datatypes.MetaArray ","text":" Bases: ndarray Array with metadata. "},{"location":"reference/datatypes/#tablite.datatypes.MetaArray-functions","title":"Functions","text":""},{"location":"reference/datatypes/#tablite.datatypes.MetaArray.__new__","title":"tablite.datatypes.MetaArray.__new__(array, dtype=None, order=None, **kwargs) ","text":"Source code in tablite/datatypes.py def __new__(cls, array, dtype=None, order=None, **kwargs):\n obj = np.asarray(array, dtype=dtype, order=order).view(cls)\n obj.metadata = kwargs\n return obj\n "},{"location":"reference/datatypes/#tablite.datatypes.MetaArray.__array_finalize__","title":"tablite.datatypes.MetaArray.__array_finalize__(obj) ","text":"Source code in tablite/datatypes.py def __array_finalize__(self, obj):\n if obj is None:\n return\n self.metadata = getattr(obj, \"metadata\", None)\n "},{"location":"reference/datatypes/#tablite.datatypes-functions","title":"Functions","text":""},{"location":"reference/datatypes/#tablite.datatypes.numpy_to_python","title":"tablite.datatypes.numpy_to_python(obj: Any) -> Any ","text":"Converts numpy types to python types. See https://numpy.org/doc/stable/reference/arrays.scalars.html PARAMETER DESCRIPTION obj A numpy object TYPE: Any RETURNS DESCRIPTION Any python object: A python object Source code in tablite/datatypes.py def numpy_to_python(obj: Any) -> Any:\n \"\"\"Converts numpy types to python types.\n\n See https://numpy.org/doc/stable/reference/arrays.scalars.html\n\n Args:\n obj (Any): A numpy object\n\n Returns:\n python object: A python object\n \"\"\"\n if isinstance(obj, np.generic):\n return obj.item()\n return obj\n "},{"location":"reference/datatypes/#tablite.datatypes.pytype","title":"tablite.datatypes.pytype(obj) ","text":"Returns the python type of any object PARAMETER DESCRIPTION obj any numpy or python object TYPE: Any RETURNS DESCRIPTION type type of obj Source code in tablite/datatypes.py def pytype(obj):\n \"\"\"Returns the python type of any object\n\n Args:\n obj (Any): any numpy or python object\n\n Returns:\n type: type of obj\n \"\"\"\n if isinstance(obj, np.generic):\n return type(obj.item())\n return type(obj)\n "},{"location":"reference/datatypes/#tablite.datatypes.pytype_from_iterable","title":"tablite.datatypes.pytype_from_iterable(iterable: {tuple, list}) -> {np.dtype, dict} ","text":"helper to make correct np array from python types. PARAMETER DESCRIPTION iterable values to be converted to numpy array. 
TYPE: (tuple, list) RAISES DESCRIPTION NotImplementedError if datatype is not supported. RETURNS DESCRIPTION {dtype, dict} np.dtype: python type of the iterable. Source code in tablite/datatypes.py def pytype_from_iterable(iterable: {tuple, list}) -> {np.dtype, dict}:\n \"\"\"helper to make correct np array from python types.\n\n Args:\n iterable (tuple,list): values to be converted to numpy array.\n\n Raises:\n NotImplementedError: if datatype is not supported.\n\n Returns:\n np.dtype: python type of the iterable.\n \"\"\"\n py_types = {}\n if isinstance(iterable, (tuple, list)):\n type_counter = Counter((pytype(v) for v in iterable))\n\n for k, v in type_counter.items():\n py_types[k] = v\n\n if len(py_types) == 0:\n np_dtype, py_dtype = object, bool\n elif len(py_types) == 1:\n py_dtype = list(py_types.keys())[0]\n if py_dtype == datetime:\n np_dtype = np.datetime64\n elif py_dtype == date:\n np_dtype = np.datetime64\n elif py_dtype == timedelta:\n np_dtype = np.timedelta64\n else:\n np_dtype = None\n else:\n np_dtype = object\n elif isinstance(iterable, np.ndarray):\n if iterable.dtype == object:\n np_dtype = object\n py_types = dict(Counter((pytype(v) for v in iterable)))\n else:\n np_dtype = iterable.dtype\n if len(iterable) > 0:\n py_types = {pytype(iterable[0]): len(iterable)}\n else:\n py_types = {pytype(np_dtype.type()): len(iterable)}\n else:\n raise NotImplementedError(f\"No handler for {type(iterable)}\")\n\n return np_dtype, py_types\n "},{"location":"reference/datatypes/#tablite.datatypes.list_to_np_array","title":"tablite.datatypes.list_to_np_array(iterable) ","text":"helper to make correct np array from python types. Example of problem where numpy turns mixed types into strings. np.array([4, '5']) np.ndarray(['4', '5']) RETURNS DESCRIPTION np.array datatypes Source code in tablite/datatypes.py def list_to_np_array(iterable):\n \"\"\"helper to make correct np array from python types.\n Example of problem where numpy turns mixed types into strings.\n >>> np.array([4, '5'])\n np.ndarray(['4', '5'])\n\n returns:\n np.array\n datatypes\n \"\"\"\n np_dtype, py_dtype = pytype_from_iterable(iterable)\n\n value = MetaArray(iterable, dtype=np_dtype, py_dtype=py_dtype)\n return value\n "},{"location":"reference/datatypes/#tablite.datatypes.np_type_unify","title":"tablite.datatypes.np_type_unify(arrays) ","text":"unifies numpy types. PARAMETER DESCRIPTION arrays List of numpy arrays TYPE: list RETURNS DESCRIPTION np.ndarray: numpy array of a single type. Source code in tablite/datatypes.py def np_type_unify(arrays):\n \"\"\"unifies numpy types.\n\n Args:\n arrays (list): List of numpy arrays\n\n Returns:\n np.ndarray: numpy array of a single type.\n \"\"\"\n dtypes = {arr.dtype: len(arr) for arr in arrays}\n if len(dtypes) == 1:\n dtype, _ = dtypes.popitem()\n else:\n for ix, arr in enumerate(arrays):\n arrays[ix] = np.array(arr, dtype=object)\n dtype = object\n return np.concatenate(arrays, dtype=dtype)\n "},{"location":"reference/datatypes/#tablite.datatypes.multitype_set","title":"tablite.datatypes.multitype_set(arr) ","text":"prevents loss of True, False when calling sets. python loses values when returning a set. Example: {1, True, 0, False} PARAMETER DESCRIPTION arr iterable of mixed types. TYPE: Iterable RETURNS DESCRIPTION np.array: with unique values. Source code in tablite/datatypes.py def multitype_set(arr):\n \"\"\"prevents loss of True, False when calling sets.\n\n python loses values when returning a set. 
Example:\n >>> {1, True, 0, False}\n {0,1}\n\n Args:\n arr (Iterable): iterable of mixed types.\n\n Returns:\n np.array: with unique values.\n \"\"\"\n L = [(type(v), v) for v in arr]\n L = list(set(L))\n L = [v for _, v in L]\n return np.array(L, dtype=object)\n "},{"location":"reference/diff/","title":"Diff","text":""},{"location":"reference/diff/#tablite.diff","title":"tablite.diff ","text":""},{"location":"reference/diff/#tablite.diff-classes","title":"Classes","text":""},{"location":"reference/diff/#tablite.diff-functions","title":"Functions","text":""},{"location":"reference/diff/#tablite.diff.diff","title":"tablite.diff.diff(T, other, columns=None) ","text":"compares table self with table other PARAMETER DESCRIPTION self Table TYPE: Table other Table TYPE: Table columns list of column names to include in comparison. Defaults to None. TYPE: List DEFAULT: None RETURNS DESCRIPTION Table diff of self and other with diff in columns 1st and 2nd. Source code in tablite/diff.py def diff(T, other, columns=None):\n \"\"\"compares table self with table other\n\n Args:\n self (Table): Table\n other (Table): Table\n columns (List, optional): list of column names to include in comparison. Defaults to None.\n\n Returns:\n Table: diff of self and other with diff in columns 1st and 2nd.\n \"\"\"\n sub_cls_check(T, BaseTable)\n sub_cls_check(other, BaseTable)\n if columns is None:\n columns = [name for name in T.columns if name in other.columns]\n elif isinstance(columns, list) and all(isinstance(i, str) for i in columns):\n for name in columns:\n if name not in T.columns:\n raise ValueError(f\"column '{name}' not found\")\n if name not in other.columns:\n raise ValueError(f\"column '{name}' not found\")\n else:\n raise TypeError(\"Expected list of column names\")\n\n t1 = T[columns]\n if issubclass(type(t1), BaseTable):\n t1 = [tuple(r) for r in T.rows]\n else:\n t1 = list(T)\n t2 = other[columns]\n if issubclass(type(t2), BaseTable):\n t2 = [tuple(r) for r in other.rows]\n else:\n t2 = list(other)\n\n sm = difflib.SequenceMatcher(None, t1, t2)\n new = type(T)()\n first = unique_name(\"1st\", columns)\n second = unique_name(\"2nd\", columns)\n new.add_columns(*columns + [first, second])\n\n news = {n: [] for n in new.columns} # Cache for Work in progress.\n\n for opc, t1a, t1b, t2a, t2b in sm.get_opcodes():\n if opc == \"insert\":\n for name, col in zip(columns, zip(*t2[t2a:t2b])):\n news[name].extend(col)\n news[first] += [\"-\"] * (t2b - t2a)\n news[second] += [\"+\"] * (t2b - t2a)\n\n elif opc == \"delete\":\n for name, col in zip(columns, zip(*t1[t1a:t1b])):\n news[name].extend(col)\n news[first] += [\"+\"] * (t1b - t1a)\n news[second] += [\"-\"] * (t1b - t1a)\n\n elif opc == \"equal\":\n for name, col in zip(columns, zip(*t2[t2a:t2b])):\n news[name].extend(col)\n news[first] += [\"=\"] * (t2b - t2a)\n news[second] += [\"=\"] * (t2b - t2a)\n\n elif opc == \"replace\":\n for name, col in zip(columns, zip(*t2[t2a:t2b])):\n news[name].extend(col)\n news[first] += [\"r\"] * (t2b - t2a)\n news[second] += [\"r\"] * (t2b - t2a)\n\n else:\n pass\n\n # Clear cache to free up memory.\n if len(news[first]) > Config.PAGE_SIZE:\n for name, L in news.items():\n new[name].extend(np.array(L))\n L.clear()\n\n for name, L in news.items():\n new[name].extend(np.array(L))\n L.clear()\n return new\n "},{"location":"reference/export_utils/","title":"Export utils","text":""},{"location":"reference/export_utils/#tablite.export_utils","title":"tablite.export_utils 
","text":""},{"location":"reference/export_utils/#tablite.export_utils-classes","title":"Classes","text":""},{"location":"reference/export_utils/#tablite.export_utils-functions","title":"Functions","text":""},{"location":"reference/export_utils/#tablite.export_utils.to_sql","title":"tablite.export_utils.to_sql(table, name) ","text":"generates ANSI-92 compliant SQL. PARAMETER DESCRIPTION name name of SQL table. TYPE: str Source code in tablite/export_utils.py def to_sql(table, name):\n \"\"\"\n generates ANSI-92 compliant SQL.\n\n args:\n name (str): name of SQL table.\n \"\"\"\n sub_cls_check(table, BaseTable)\n type_check(name, str)\n\n prefix = name\n name = \"T1\"\n create_table = \"\"\"CREATE TABLE {} ({})\"\"\"\n columns = []\n for name, col in table.columns.items():\n dtype = col.types()\n if len(dtype) == 1:\n dtype, _ = dtype.popitem()\n if dtype is int:\n dtype = \"INTEGER\"\n elif dtype is float:\n dtype = \"REAL\"\n else:\n dtype = \"TEXT\"\n else:\n dtype = \"TEXT\"\n definition = f\"{name} {dtype}\"\n columns.append(definition)\n\n create_table = create_table.format(prefix, \", \".join(columns))\n\n # return create_table\n row_inserts = []\n for row in table.rows:\n row_inserts.append(str(tuple([i if i is not None else \"NULL\" for i in row])))\n row_inserts = f\"INSERT INTO {prefix} VALUES \" + \",\".join(row_inserts)\n return \"begin; {}; {}; commit;\".format(create_table, row_inserts)\n "},{"location":"reference/export_utils/#tablite.export_utils.to_pandas","title":"tablite.export_utils.to_pandas(table) ","text":"returns pandas.DataFrame Source code in tablite/export_utils.py def to_pandas(table):\n \"\"\"\n returns pandas.DataFrame\n \"\"\"\n sub_cls_check(table, BaseTable)\n try:\n return pd.DataFrame(table.to_dict()) # noqa\n except ImportError:\n import pandas as pd # noqa\n return pd.DataFrame(table.to_dict()) # noqa\n "},{"location":"reference/export_utils/#tablite.export_utils.to_hdf5","title":"tablite.export_utils.to_hdf5(table, path) ","text":"creates a copy of the table as hdf5 Note that some loss of type information is to be expected in columns of mixed type: t.show(dtype=True) +===+===+=====+=====+====+=====+=====+===================+==========+========+===============+===+=========================+=====+===+ | # | A | B | C | D | E | F | G | H | I | J | K | L | M | O | |row|int|mixed|float|str |mixed| bool| datetime | date | time | timedelta |str| int |float|int| +---+---+-----+-----+----+-----+-----+-------------------+----------+--------+---------------+---+-------------------------+-----+---+ | 0 | -1|None | -1.1| |None |False|2023-06-09 09:12:06|2023-06-09|09:12:06| 1 day, 0:00:00|b |-100000000000000000000000| inf| 11| | 1 | 1| 1| 1.1|1000|1 | True|2023-06-09 09:12:06|2023-06-09|09:12:06|2 days, 0:06:40|\u55e8 | 100000000000000000000000| -inf|-11| +===+===+=====+=====+====+=====+=====+===================+==========+========+===============+===+=========================+=====+===+ t.to_hdf5(filename) t2 = Table.from_hdf5(filename) t2.show(dtype=True) +===+===+=====+=====+=====+=====+=====+===================+===================+========+===============+===+=========================+=====+===+ | # | A | B | C | D | E | F | G | H | I | J | K | L | M | O | |row|int|mixed|float|mixed|mixed| bool| datetime | datetime | time | str |str| int |float|int| +---+---+-----+-----+-----+-----+-----+-------------------+-------------------+--------+---------------+---+-------------------------+-----+---+ | 0 | -1|None | -1.1|None |None |False|2023-06-09 09:12:06|2023-06-09 
00:00:00|09:12:06|1 day, 0:00:00 |b |-100000000000000000000000| inf| 11| | 1 | 1| 1| 1.1| 1000| 1| True|2023-06-09 09:12:06|2023-06-09 00:00:00|09:12:06|2 days, 0:06:40|\u55e8 | 100000000000000000000000| -inf|-11| +===+===+=====+=====+=====+=====+=====+===================+===================+========+===============+===+=========================+=====+===+ Source code in tablite/export_utils.py def to_hdf5(table, path):\n # fmt: off\n \"\"\"\n creates a copy of the table as hdf5\n\n Note that some loss of type information is to be expected in columns of mixed type:\n >>> t.show(dtype=True)\n +===+===+=====+=====+====+=====+=====+===================+==========+========+===============+===+=========================+=====+===+\n | # | A | B | C | D | E | F | G | H | I | J | K | L | M | O |\n |row|int|mixed|float|str |mixed| bool| datetime | date | time | timedelta |str| int |float|int|\n +---+---+-----+-----+----+-----+-----+-------------------+----------+--------+---------------+---+-------------------------+-----+---+\n | 0 | -1|None | -1.1| |None |False|2023-06-09 09:12:06|2023-06-09|09:12:06| 1 day, 0:00:00|b |-100000000000000000000000| inf| 11|\n | 1 | 1| 1| 1.1|1000|1 | True|2023-06-09 09:12:06|2023-06-09|09:12:06|2 days, 0:06:40|\u55e8 | 100000000000000000000000| -inf|-11|\n +===+===+=====+=====+====+=====+=====+===================+==========+========+===============+===+=========================+=====+===+\n >>> t.to_hdf5(filename)\n >>> t2 = Table.from_hdf5(filename)\n >>> t2.show(dtype=True)\n +===+===+=====+=====+=====+=====+=====+===================+===================+========+===============+===+=========================+=====+===+\n | # | A | B | C | D | E | F | G | H | I | J | K | L | M | O |\n |row|int|mixed|float|mixed|mixed| bool| datetime | datetime | time | str |str| int |float|int|\n +---+---+-----+-----+-----+-----+-----+-------------------+-------------------+--------+---------------+---+-------------------------+-----+---+\n | 0 | -1|None | -1.1|None |None |False|2023-06-09 09:12:06|2023-06-09 00:00:00|09:12:06|1 day, 0:00:00 |b |-100000000000000000000000| inf| 11|\n | 1 | 1| 1| 1.1| 1000| 1| True|2023-06-09 09:12:06|2023-06-09 00:00:00|09:12:06|2 days, 0:06:40|\u55e8 | 100000000000000000000000| -inf|-11|\n +===+===+=====+=====+=====+=====+=====+===================+===================+========+===============+===+=========================+=====+===+\n \"\"\"\n # fmt: in\n import h5py\n\n sub_cls_check(table, BaseTable)\n type_check(path, Path)\n\n total = f\"{len(table.columns) * len(table):,}\" # noqa\n print(f\"writing {total} records to {path}\", end=\"\")\n\n with h5py.File(path, \"w\") as f:\n n = 0\n for name, col in table.items():\n try:\n f.create_dataset(name, data=col[:]) # stored in hdf5 as '/name'\n except TypeError:\n f.create_dataset(name, data=[str(i) for i in col[:]]) # stored in hdf5 as '/name'\n n += 1\n print(\"... done\")\n "},{"location":"reference/export_utils/#tablite.export_utils.excel_writer","title":"tablite.export_utils.excel_writer(table, path) ","text":"writer for excel files. This can create xlsx files beyond Excels. If you're using pyexcel to read the data, you'll see the data is there. If you're using Excel, Excel will stop loading after 1,048,576 rows. 
See pyexcel for more details: http://docs.pyexcel.org/ Source code in tablite/export_utils.py def excel_writer(table, path):\n \"\"\"\n writer for excel files.\n\n This can create xlsx files beyond Excels.\n If you're using pyexcel to read the data, you'll see the data is there.\n If you're using Excel, Excel will stop loading after 1,048,576 rows.\n\n See pyexcel for more details:\n http://docs.pyexcel.org/\n \"\"\"\n import pyexcel\n\n sub_cls_check(table, BaseTable)\n type_check(path, Path)\n\n def gen(table): # local helper\n yield table.columns\n for row in table.rows:\n yield row\n\n data = list(gen(table))\n if path.suffix in [\".xls\", \".ods\"]:\n data = [\n [str(v) if (isinstance(v, (int, float)) and abs(v) > 2**32 - 1) else DataTypes.to_json(v) for v in row]\n for row in data\n ]\n\n pyexcel.save_as(array=data, dest_file_name=str(path))\n "},{"location":"reference/export_utils/#tablite.export_utils.to_json","title":"tablite.export_utils.to_json(table, *args, **kwargs) ","text":"Source code in tablite/export_utils.py def to_json(table, *args, **kwargs):\n import json\n\n sub_cls_check(table, BaseTable)\n return json.dumps(table.as_json_serializable())\n "},{"location":"reference/export_utils/#tablite.export_utils.path_suffix_check","title":"tablite.export_utils.path_suffix_check(path, kind) ","text":"Source code in tablite/export_utils.py def path_suffix_check(path, kind):\n if not path.suffix == kind:\n raise ValueError(f\"Suffix mismatch: Expected {kind}, got {path.suffix} in {path.name}\")\n if not path.parent.exists():\n raise FileNotFoundError(f\"directory {path.parent} not found.\")\n "},{"location":"reference/export_utils/#tablite.export_utils.text_writer","title":"tablite.export_utils.text_writer(table, path, tqdm=_tqdm) ","text":"exports table to csv, tsv or txt depending on path suffix. follows the JSON norm. text escape is ON for all strings. "},{"location":"reference/export_utils/#tablite.export_utils.text_writer--note","title":"Note:","text":"If the delimiter is present in a string when the string is exported, text-escape is required, as the format otherwise is corrupted. When the file is being written, it is unknown whether any string in a column contains the delimiter. As text escaping the few strings that may contain the delimiter would lead to an asymmetric format, the safer guess is to text escape all strings. Source code in tablite/export_utils.py def text_writer(table, path, tqdm=_tqdm):\n \"\"\"exports table to csv, tsv or txt depending on path suffix.\n follows the JSON norm. text escape is ON for all strings.\n\n Note:\n ----------------------\n If the delimiter is present in a string when the string is exported,\n text-escape is required, as the format otherwise is corrupted.\n When the file is being written, it is unknown whether any string in\n a column contains the delimiter. 
As text escaping the few strings\n that may contain the delimiter would lead to an asymmetric format,\n the safer guess is to text escape all strings.\n \"\"\"\n sub_cls_check(table, BaseTable)\n type_check(path, Path)\n\n def txt(value): # helper for text writer\n if value is None:\n return \"\" # A column with 1,None,2 must be \"1,,2\".\n elif isinstance(value, str):\n # if not (value.startswith('\"') and value.endswith('\"')):\n # return f'\"{value}\"' # this must be escape: \"the quick fox, jumped over the comma\"\n # else:\n return value # this would for example be an empty string: \"\"\n else:\n return str(DataTypes.to_json(value)) # this handles datetimes, timedelta, etc.\n\n delimiters = {\".csv\": \",\", \".tsv\": \"\\t\", \".txt\": \"|\"}\n delimiter = delimiters.get(path.suffix)\n\n with path.open(\"w\", encoding=\"utf-8\") as fo:\n fo.write(delimiter.join(c for c in table.columns) + \"\\n\")\n for row in tqdm(table.rows, total=len(table), disable=Config.TQDM_DISABLE):\n fo.write(delimiter.join(txt(c) for c in row) + \"\\n\")\n "},{"location":"reference/export_utils/#tablite.export_utils.sql_writer","title":"tablite.export_utils.sql_writer(table, path) ","text":"Source code in tablite/export_utils.py def sql_writer(table, path):\n type_check(table, BaseTable)\n type_check(path, Path)\n with path.open(\"w\", encoding=\"utf-8\") as fo:\n fo.write(to_sql(table))\n "},{"location":"reference/export_utils/#tablite.export_utils.json_writer","title":"tablite.export_utils.json_writer(table, path) ","text":"Source code in tablite/export_utils.py def json_writer(table, path):\n type_check(table, BaseTable)\n type_check(path, Path)\n with path.open(\"w\") as fo:\n fo.write(to_json(table))\n "},{"location":"reference/export_utils/#tablite.export_utils.to_html","title":"tablite.export_utils.to_html(table, path) ","text":"Source code in tablite/export_utils.py def to_html(table, path):\n type_check(table, BaseTable)\n type_check(path, Path)\n with path.open(\"w\", encoding=\"utf-8\") as fo:\n fo.write(table._repr_html_(slice(0, len(table))))\n "},{"location":"reference/file_reader_utils/","title":"File reader utils","text":""},{"location":"reference/file_reader_utils/#tablite.file_reader_utils","title":"tablite.file_reader_utils ","text":""},{"location":"reference/file_reader_utils/#tablite.file_reader_utils-attributes","title":"Attributes","text":""},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.ENCODING_GUESS_BYTES","title":"tablite.file_reader_utils.ENCODING_GUESS_BYTES = 10000 module-attribute ","text":""},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.header_readers","title":"tablite.file_reader_utils.header_readers = {'fods': excel_reader_headers, 'json': excel_reader_headers, 'simple': excel_reader_headers, 'rst': excel_reader_headers, 'mediawiki': excel_reader_headers, 'xlsx': excel_reader_headers, 'xlsm': excel_reader_headers, 'csv': text_reader_headers, 'tsv': text_reader_headers, 'txt': text_reader_headers, 'ods': ods_reader_headers} module-attribute ","text":""},{"location":"reference/file_reader_utils/#tablite.file_reader_utils-classes","title":"Classes","text":""},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.TextEscape","title":"tablite.file_reader_utils.TextEscape(openings='({[', closures=']})', text_qualifier='\"', delimiter=',', strip_leading_and_tailing_whitespace=False) ","text":" Bases: object enables parsing of CSV while respecting brackets and text marks. 
Example: text_escape = TextEscape() # set up the instance. for line in somefile.readlines(): list_of_words = text_escape(line) # use the instance. ... As an example, the Danes and Germans use \" for inches and ' for feet, so we will see data that contains nail (75 x 4 mm, 3\" x 3/12\"), so for this case ( and ) are valid escapes, but \" and ' aren't. Source code in tablite/file_reader_utils.py def __init__(\n self,\n openings=\"({[\",\n closures=\"]})\",\n text_qualifier='\"',\n delimiter=\",\",\n strip_leading_and_tailing_whitespace=False,\n):\n \"\"\"\n As an example, the Danes and Germans use \" for inches and ' for feet,\n so we will see data that contains nail (75 x 4 mm, 3\" x 3/12\"), so\n for this case ( and ) are valid escapes, but \" and ' aren't.\n\n \"\"\"\n if openings is None:\n openings = [None]\n elif isinstance(openings, str):\n self.openings = {c for c in openings}\n else:\n raise TypeError(f\"expected str, got {type(openings)}\")\n\n if closures is None:\n closures = [None]\n elif isinstance(closures, str):\n self.closures = {c for c in closures}\n else:\n raise TypeError(f\"expected str, got {type(closures)}\")\n\n if not isinstance(delimiter, str):\n raise TypeError(f\"expected str, got {type(delimiter)}\")\n self.delimiter = delimiter\n self._delimiter_length = len(delimiter)\n self.strip_leading_and_tailing_whitespace = strip_leading_and_tailing_whitespace\n\n if text_qualifier is None:\n pass\n elif text_qualifier in openings + closures:\n raise ValueError(\"It's a bad idea to have qoute character appears in openings or closures.\")\n else:\n self.qoute = text_qualifier\n\n if not text_qualifier:\n if not self.strip_leading_and_tailing_whitespace:\n self.c = self._call_1\n else:\n self.c = self._call_2\n else:\n self.c = self._call_3\n "},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.TextEscape-attributes","title":"Attributes","text":""},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.TextEscape.openings","title":"tablite.file_reader_utils.TextEscape.openings = {c for c in openings} instance-attribute ","text":""},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.TextEscape.closures","title":"tablite.file_reader_utils.TextEscape.closures = {c for c in closures} instance-attribute ","text":""},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.TextEscape.delimiter","title":"tablite.file_reader_utils.TextEscape.delimiter = delimiter instance-attribute ","text":""},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.TextEscape.strip_leading_and_tailing_whitespace","title":"tablite.file_reader_utils.TextEscape.strip_leading_and_tailing_whitespace = strip_leading_and_tailing_whitespace instance-attribute ","text":""},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.TextEscape.qoute","title":"tablite.file_reader_utils.TextEscape.qoute = text_qualifier instance-attribute ","text":""},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.TextEscape.c","title":"tablite.file_reader_utils.TextEscape.c = self._call_1 instance-attribute ","text":""},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.TextEscape-functions","title":"Functions","text":""},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.TextEscape.__call__","title":"tablite.file_reader_utils.TextEscape.__call__(s) ","text":"Source code in tablite/file_reader_utils.py def __call__(self, s):\n return self.c(s)\n 
"},{"location":"reference/file_reader_utils/#tablite.file_reader_utils-functions","title":"Functions","text":""},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.split_by_sequence","title":"tablite.file_reader_utils.split_by_sequence(text, sequence) ","text":"helper to split text according to a split sequence. Source code in tablite/file_reader_utils.py def split_by_sequence(text, sequence):\n \"\"\"helper to split text according to a split sequence.\"\"\"\n chunks = tuple()\n for element in sequence:\n idx = text.find(element)\n if idx < 0:\n raise ValueError(f\"'{element}' not in row\")\n chunk, text = text[:idx], text[len(element) + idx :]\n chunks += (chunk,)\n chunks += (text,) # the remaining text.\n return chunks\n "},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.detect_seperator","title":"tablite.file_reader_utils.detect_seperator(text) ","text":":param path: pathlib.Path objects :param encoding: file encoding. :return: 1 character. Source code in tablite/file_reader_utils.py def detect_seperator(text):\n \"\"\"\n :param path: pathlib.Path objects\n :param encoding: file encoding.\n :return: 1 character.\n \"\"\"\n # After reviewing the logic in the CSV sniffer, I concluded that all it\n # really does is to look for a non-text character. As the separator is\n # determined by the first line, which almost always is a line of headers,\n # the text characters will be utf-8,16 or ascii letters plus white space.\n # This leaves the characters ,;:| and \\t as potential separators, with one\n # exception: files that use whitespace as separator. My logic is therefore\n # to (1) find the set of characters that intersect with ',;:|\\t' which in\n # practice is a single character, unless (2) it is empty whereby it must\n # be whitespace.\n if len(text) == 0:\n return None\n seps = {\",\", \"\\t\", \";\", \":\", \"|\"}.intersection(text)\n if not seps:\n if \" \" in text:\n return \" \"\n if \"\\n\" in text:\n return \"\\n\"\n else:\n raise ValueError(\"separator not detected\")\n if len(seps) == 1:\n return seps.pop()\n else:\n frq = [(text.count(i), i) for i in seps]\n frq.sort(reverse=True) # most frequent first.\n return frq[0][-1]\n "},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.text_reader_headers","title":"tablite.file_reader_utils.text_reader_headers(path, delimiter, header_row_index, text_qualifier, linecount) ","text":"Source code in tablite/file_reader_utils.py def text_reader_headers(path, delimiter, header_row_index, text_qualifier, linecount):\n d = {}\n delimiters = {\n \".csv\": \",\",\n \".tsv\": \"\\t\",\n \".txt\": None,\n }\n\n try:\n with path.open(\"rb\") as fi:\n rawdata = fi.read(ENCODING_GUESS_BYTES)\n encoding = chardet.detect(rawdata)[\"encoding\"]\n\n if delimiter is None:\n with path.open(\"r\", encoding=encoding, errors=\"ignore\") as fi:\n lines = []\n for n, line in enumerate(fi, -header_row_index):\n if n < 0:\n continue\n line = line.rstrip(\"\\n\")\n lines.append(line)\n if n >= linecount:\n break # break on first\n try:\n d[\"delimiter\"] = delimiter = detect_seperator(\"\\n\".join(lines))\n except ValueError as e:\n if e.args == (\"separator not detected\", ):\n d[\"delimiter\"] = delimiter = None # this will handle the case of 1 column, 1 row\n else:\n raise e\n\n if delimiter is None:\n d[\"delimiter\"] = delimiter = delimiters[path.suffix] # pickup the default one\n d[path.name] = [lines]\n d[\"is_empty\"] = True # mark as empty to return an empty table instead of throwing\n else:\n kwargs = 
{}\n\n if text_qualifier is not None:\n kwargs[\"text_qualifier\"] = text_qualifier\n kwargs[\"quoting\"] = \"QUOTE_MINIMAL\"\n else:\n kwargs[\"quoting\"] = \"QUOTE_NONE\"\n\n d[path.name] = _get_headers(\n str(path), py_to_nim_encoding(encoding), header_row_index=header_row_index,\n delimiter=delimiter,\n linecount=linecount,\n **kwargs\n )\n return d\n except Exception as e:\n raise ValueError(f\"can't read {path.suffix}\")\n "},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.excel_reader_headers","title":"tablite.file_reader_utils.excel_reader_headers(path, delimiter, header_row_index, text_qualifier, linecount) ","text":"Source code in tablite/file_reader_utils.py def excel_reader_headers(path, delimiter, header_row_index, text_qualifier, linecount):\n d = {}\n book = openpyxl.open(str(path), read_only=True)\n\n try:\n all_sheets = book.sheetnames\n\n for sheet_name, sheet in ((name, book[name]) for name in all_sheets):\n fixup_worksheet(sheet)\n if sheet.max_row is None:\n max_rows = 0\n else:\n max_rows = min(sheet.max_row, linecount + 1)\n container = [None] * max_rows\n padding_ends = 0\n max_column = sheet.max_column\n\n for i, row_data in enumerate(sheet.iter_rows(0, header_row_index + max_rows, values_only=True), start=-header_row_index):\n if i < 0:\n # NOTE: for some reason `iter_rows` specifying a start row starts reading cells as binary, instead skip the rows that are before our first read row\n continue\n\n # NOTE: text readers do not cast types and give back strings, neither should xlsx reader, can't find documentation if it's possible to ignore this via `iter_rows` instead of casting back to string\n container[i] = [DataTypes.to_json(v) for v in row_data]\n\n for j, cell in enumerate(reversed(row_data)):\n if cell is None:\n continue\n\n padding_ends = max(padding_ends, max_column - j)\n\n break\n\n d[sheet_name] = [None if c is None else c[0:padding_ends] for c in container]\n d[\"delimiter\"] = None\n finally:\n book.close()\n\n return d\n "},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.ods_reader_headers","title":"tablite.file_reader_utils.ods_reader_headers(path, delimiter, header_row_index, text_qualifier, linecount) ","text":"Source code in tablite/file_reader_utils.py def ods_reader_headers(path, delimiter, header_row_index, text_qualifier, linecount):\n d = {\n \"delimiter\": None\n }\n sheets = pyexcel.get_book_dict(file_name=str(path))\n\n for sheet_name, data in sheets.items():\n lines = [[DataTypes.to_json(v) for v in row] for row in data[header_row_index:header_row_index+linecount]]\n\n d[sheet_name] = lines\n\n return d\n "},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.get_headers","title":"tablite.file_reader_utils.get_headers(path, delimiter=None, header_row_index=0, text_qualifier=None, linecount=10) ","text":"file format definition csv comma separated values tsv tab separated values csvz a zip file that contains one or many csv files tsvz a zip file that contains one or many tsv files xls a spreadsheet file format created by MS-Excel 97-2003 xlsx MS-Excel Extensions to the Office Open XML SpreadsheetML File Format. 
xlsm an MS-Excel Macro-Enabled Workbook file ods open document spreadsheet fods flat open document spreadsheet json java script object notation html html table of the data structure simple simple presentation rst rStructured Text presentation of the data mediawiki media wiki table Source code in tablite/file_reader_utils.py def get_headers(path, delimiter=None, header_row_index=0, text_qualifier=None, linecount=10):\n \"\"\"\n file format\tdefinition\n csv\t comma separated values\n tsv\t tab separated values\n csvz\ta zip file that contains one or many csv files\n tsvz\ta zip file that contains one or many tsv files\n xls\t a spreadsheet file format created by MS-Excel 97-2003\n xlsx\tMS-Excel Extensions to the Office Open XML SpreadsheetML File Format.\n xlsm\tan MS-Excel Macro-Enabled Workbook file\n ods\t open document spreadsheet\n fods\tflat open document spreadsheet\n json\tjava script object notation\n html\thtml table of the data structure\n simple\tsimple presentation\n rst\t rStructured Text presentation of the data\n mediawiki\tmedia wiki table\n \"\"\"\n if isinstance(path, str):\n path = Path(path)\n if not isinstance(path, Path):\n raise TypeError(\"expected pathlib path.\")\n if not path.exists():\n raise FileNotFoundError(str(path))\n if delimiter is not None:\n if not isinstance(delimiter, str):\n raise TypeError(f\"expected str or None, not {type(delimiter)}\")\n\n kwargs = {\n \"path\": path,\n \"delimiter\": delimiter,\n \"header_row_index\": header_row_index,\n \"text_qualifier\": text_qualifier,\n \"linecount\": linecount\n }\n\n reader = header_readers.get(path.suffix[1:], None)\n\n if reader is None:\n raise TypeError(f\"file format for headers not supported: {path.suffix}\")\n\n result = reader(**kwargs)\n\n return result\n "},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.get_encoding","title":"tablite.file_reader_utils.get_encoding(path, nbytes=ENCODING_GUESS_BYTES) ","text":"Source code in tablite/file_reader_utils.py def get_encoding(path, nbytes=ENCODING_GUESS_BYTES):\n nbytes = min(nbytes, path.stat().st_size)\n with path.open(\"rb\") as fi:\n rawdata = fi.read(nbytes)\n encoding = chardet.detect(rawdata)[\"encoding\"]\n if encoding == \"ascii\": # utf-8 is backwards compatible with ascii\n return \"utf-8\" # -- so should the first 10k chars not be enough,\n return encoding # -- the utf-8 encoding will still get it right.\n "},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.get_delimiter","title":"tablite.file_reader_utils.get_delimiter(path, encoding) ","text":"Source code in tablite/file_reader_utils.py def get_delimiter(path, encoding):\n with path.open(\"r\", encoding=encoding, errors=\"ignore\") as fi:\n lines = []\n for n, line in enumerate(fi):\n line = line.rstrip(\"\\n\")\n lines.append(line)\n if n > 10:\n break # break on first\n delimiter = detect_seperator(\"\\n\".join(lines))\n if delimiter is None:\n raise ValueError(\"Delimiter could not be determined\")\n return delimiter\n "},{"location":"reference/groupby_utils/","title":"Groupby utils","text":""},{"location":"reference/groupby_utils/#tablite.groupby_utils","title":"tablite.groupby_utils ","text":""},{"location":"reference/groupby_utils/#tablite.groupby_utils-classes","title":"Classes","text":""},{"location":"reference/groupby_utils/#tablite.groupby_utils.GroupBy","title":"tablite.groupby_utils.GroupBy ","text":" Bases: object 
"},{"location":"reference/groupby_utils/#tablite.groupby_utils.GroupBy-attributes","title":"Attributes","text":""},{"location":"reference/groupby_utils/#tablite.groupby_utils.GroupBy.max","title":"tablite.groupby_utils.GroupBy.max = 'Max' class-attribute instance-attribute ","text":""},{"location":"reference/groupby_utils/#tablite.groupby_utils.GroupBy.min","title":"tablite.groupby_utils.GroupBy.min = 'Min' class-attribute instance-attribute ","text":""},{"location":"reference/groupby_utils/#tablite.groupby_utils.GroupBy.sum","title":"tablite.groupby_utils.GroupBy.sum = 'Sum' class-attribute instance-attribute ","text":""},{"location":"reference/groupby_utils/#tablite.groupby_utils.GroupBy.product","title":"tablite.groupby_utils.GroupBy.product = 'Product' class-attribute instance-attribute ","text":""},{"location":"reference/groupby_utils/#tablite.groupby_utils.GroupBy.first","title":"tablite.groupby_utils.GroupBy.first = 'First' class-attribute instance-attribute ","text":""},{"location":"reference/groupby_utils/#tablite.groupby_utils.GroupBy.last","title":"tablite.groupby_utils.GroupBy.last = 'Last' class-attribute instance-attribute ","text":""},{"location":"reference/groupby_utils/#tablite.groupby_utils.GroupBy.count","title":"tablite.groupby_utils.GroupBy.count = 'Count' class-attribute instance-attribute ","text":""},{"location":"reference/groupby_utils/#tablite.groupby_utils.GroupBy.count_unique","title":"tablite.groupby_utils.GroupBy.count_unique = 'CountUnique' class-attribute instance-attribute ","text":""},{"location":"reference/groupby_utils/#tablite.groupby_utils.GroupBy.avg","title":"tablite.groupby_utils.GroupBy.avg = 'Average' class-attribute instance-attribute ","text":""},{"location":"reference/groupby_utils/#tablite.groupby_utils.GroupBy.stdev","title":"tablite.groupby_utils.GroupBy.stdev = 'StandardDeviation' class-attribute instance-attribute ","text":""},{"location":"reference/groupby_utils/#tablite.groupby_utils.GroupBy.median","title":"tablite.groupby_utils.GroupBy.median = 'Median' class-attribute instance-attribute ","text":""},{"location":"reference/groupby_utils/#tablite.groupby_utils.GroupBy.mode","title":"tablite.groupby_utils.GroupBy.mode = 'Mode' class-attribute instance-attribute ","text":""},{"location":"reference/import_utils/","title":"Import utils","text":""},{"location":"reference/import_utils/#tablite.import_utils","title":"tablite.import_utils ","text":""},{"location":"reference/import_utils/#tablite.import_utils-attributes","title":"Attributes","text":""},{"location":"reference/import_utils/#tablite.import_utils.file_readers","title":"tablite.import_utils.file_readers = {'fods': excel_reader, 'json': excel_reader, 'html': from_html, 'hdf5': from_hdf5, 'simple': excel_reader, 'rst': excel_reader, 'mediawiki': excel_reader, 'xlsx': excel_reader, 'xls': excel_reader, 'xlsm': excel_reader, 'csv': text_reader, 'tsv': text_reader, 'txt': text_reader, 'ods': ods_reader} module-attribute ","text":""},{"location":"reference/import_utils/#tablite.import_utils.valid_readers","title":"tablite.import_utils.valid_readers = ','.join(list(file_readers.keys())) module-attribute ","text":""},{"location":"reference/import_utils/#tablite.import_utils-classes","title":"Classes","text":""},{"location":"reference/import_utils/#tablite.import_utils.TRconfig","title":"tablite.import_utils.TRconfig(source, destination, start, end, guess_datatypes, delimiter, text_qualifier, text_escape_openings, text_escape_closures, strip_leading_and_tailing_whitespace, encoding, 
newline_offsets, fields) ","text":" Bases: object Source code in tablite/import_utils.py def __init__(\n self,\n source,\n destination,\n start,\n end,\n guess_datatypes,\n delimiter,\n text_qualifier,\n text_escape_openings,\n text_escape_closures,\n strip_leading_and_tailing_whitespace,\n encoding,\n newline_offsets,\n fields\n) -> None:\n self.source = source\n self.destination = destination\n self.start = start\n self.end = end\n self.guess_datatypes = guess_datatypes\n self.delimiter = delimiter\n self.text_qualifier = text_qualifier\n self.text_escape_openings = text_escape_openings\n self.text_escape_closures = text_escape_closures\n self.strip_leading_and_tailing_whitespace = strip_leading_and_tailing_whitespace\n self.encoding = encoding\n self.newline_offsets = newline_offsets\n self.fields = fields\n type_check(start, int),\n type_check(end, int),\n type_check(delimiter, str),\n type_check(text_qualifier, (str, type(None))),\n type_check(text_escape_openings, str),\n type_check(text_escape_closures, str),\n type_check(encoding, str),\n type_check(strip_leading_and_tailing_whitespace, bool),\n type_check(newline_offsets, list)\n type_check(fields, dict)\n "},{"location":"reference/import_utils/#tablite.import_utils.TRconfig-attributes","title":"Attributes","text":""},{"location":"reference/import_utils/#tablite.import_utils.TRconfig.source","title":"tablite.import_utils.TRconfig.source = source instance-attribute ","text":""},{"location":"reference/import_utils/#tablite.import_utils.TRconfig.destination","title":"tablite.import_utils.TRconfig.destination = destination instance-attribute ","text":""},{"location":"reference/import_utils/#tablite.import_utils.TRconfig.start","title":"tablite.import_utils.TRconfig.start = start instance-attribute ","text":""},{"location":"reference/import_utils/#tablite.import_utils.TRconfig.end","title":"tablite.import_utils.TRconfig.end = end instance-attribute ","text":""},{"location":"reference/import_utils/#tablite.import_utils.TRconfig.guess_datatypes","title":"tablite.import_utils.TRconfig.guess_datatypes = guess_datatypes instance-attribute ","text":""},{"location":"reference/import_utils/#tablite.import_utils.TRconfig.delimiter","title":"tablite.import_utils.TRconfig.delimiter = delimiter instance-attribute ","text":""},{"location":"reference/import_utils/#tablite.import_utils.TRconfig.text_qualifier","title":"tablite.import_utils.TRconfig.text_qualifier = text_qualifier instance-attribute ","text":""},{"location":"reference/import_utils/#tablite.import_utils.TRconfig.text_escape_openings","title":"tablite.import_utils.TRconfig.text_escape_openings = text_escape_openings instance-attribute ","text":""},{"location":"reference/import_utils/#tablite.import_utils.TRconfig.text_escape_closures","title":"tablite.import_utils.TRconfig.text_escape_closures = text_escape_closures instance-attribute ","text":""},{"location":"reference/import_utils/#tablite.import_utils.TRconfig.strip_leading_and_tailing_whitespace","title":"tablite.import_utils.TRconfig.strip_leading_and_tailing_whitespace = strip_leading_and_tailing_whitespace instance-attribute ","text":""},{"location":"reference/import_utils/#tablite.import_utils.TRconfig.encoding","title":"tablite.import_utils.TRconfig.encoding = encoding instance-attribute ","text":""},{"location":"reference/import_utils/#tablite.import_utils.TRconfig.newline_offsets","title":"tablite.import_utils.TRconfig.newline_offsets = newline_offsets instance-attribute 
","text":""},{"location":"reference/import_utils/#tablite.import_utils.TRconfig.fields","title":"tablite.import_utils.TRconfig.fields = fields instance-attribute ","text":""},{"location":"reference/import_utils/#tablite.import_utils.TRconfig-functions","title":"Functions","text":""},{"location":"reference/import_utils/#tablite.import_utils.TRconfig.copy","title":"tablite.import_utils.TRconfig.copy() ","text":"Source code in tablite/import_utils.py def copy(self):\n return TRconfig(**self.dict())\n "},{"location":"reference/import_utils/#tablite.import_utils.TRconfig.dict","title":"tablite.import_utils.TRconfig.dict() ","text":"Source code in tablite/import_utils.py def dict(self):\n return {k: v for k, v in self.__dict__.items() if not (k.startswith(\"_\") or callable(v))}\n "},{"location":"reference/import_utils/#tablite.import_utils-functions","title":"Functions","text":""},{"location":"reference/import_utils/#tablite.import_utils.from_pandas","title":"tablite.import_utils.from_pandas(T, df) ","text":"Creates Table using pd.to_dict('list') similar to: import pandas as pd df = pd.DataFrame({'a':[1,2,3], 'b':[4,5,6]}) df a b 0 1 4 1 2 5 2 3 6 df.to_dict('list') t = Table.from_dict(df.to_dict('list)) t.show() +===+===+===+ | # | a | b | |row|int|int| +---+---+---+ | 0 | 1| 4| | 1 | 2| 5| | 2 | 3| 6| +===+===+===+ Source code in tablite/import_utils.py def from_pandas(T, df):\n \"\"\"\n Creates Table using pd.to_dict('list')\n\n similar to:\n >>> import pandas as pd\n >>> df = pd.DataFrame({'a':[1,2,3], 'b':[4,5,6]})\n >>> df\n a b\n 0 1 4\n 1 2 5\n 2 3 6\n >>> df.to_dict('list')\n {'a': [1, 2, 3], 'b': [4, 5, 6]}\n\n >>> t = Table.from_dict(df.to_dict('list))\n >>> t.show()\n +===+===+===+\n | # | a | b |\n |row|int|int|\n +---+---+---+\n | 0 | 1| 4|\n | 1 | 2| 5|\n | 2 | 3| 6|\n +===+===+===+\n \"\"\"\n if not issubclass(T, BaseTable):\n raise TypeError(\"Expected subclass of Table\")\n\n return T(columns=df.to_dict(\"list\")) # noqa\n "},{"location":"reference/import_utils/#tablite.import_utils.from_hdf5","title":"tablite.import_utils.from_hdf5(T, path, tqdm=_tqdm, pbar=None) ","text":"imports an exported hdf5 table. 
Note that some loss of type information is to be expected in columns of mixed type: t.show(dtype=True) +===+===+=====+=====+====+=====+=====+===================+==========+========+===============+===+=========================+=====+===+ | # | A | B | C | D | E | F | G | H | I | J | K | L | M | O | |row|int|mixed|float|str |mixed| bool| datetime | date | time | timedelta |str| int |float|int| +---+---+-----+-----+----+-----+-----+-------------------+----------+--------+---------------+---+-------------------------+-----+---+ | 0 | -1|None | -1.1| |None |False|2023-06-09 09:12:06|2023-06-09|09:12:06| 1 day, 0:00:00|b |-100000000000000000000000| inf| 11| | 1 | 1| 1| 1.1|1000|1 | True|2023-06-09 09:12:06|2023-06-09|09:12:06|2 days, 0:06:40|\u55e8 | 100000000000000000000000| -inf|-11| +===+===+=====+=====+====+=====+=====+===================+==========+========+===============+===+=========================+=====+===+ t.to_hdf5(filename) t2 = Table.from_hdf5(filename) t2.show(dtype=True) +===+===+=====+=====+=====+=====+=====+===================+===================+========+===============+===+=========================+=====+===+ | # | A | B | C | D | E | F | G | H | I | J | K | L | M | O | |row|int|mixed|float|mixed|mixed| bool| datetime | datetime | time | str |str| int |float|int| +---+---+-----+-----+-----+-----+-----+-------------------+-------------------+--------+---------------+---+-------------------------+-----+---+ | 0 | -1|None | -1.1|None |None |False|2023-06-09 09:12:06|2023-06-09 00:00:00|09:12:06|1 day, 0:00:00 |b |-100000000000000000000000| inf| 11| | 1 | 1| 1| 1.1| 1000| 1| True|2023-06-09 09:12:06|2023-06-09 00:00:00|09:12:06|2 days, 0:06:40|\u55e8 | 100000000000000000000000| -inf|-11| +===+===+=====+=====+=====+=====+=====+===================+===================+========+===============+===+=========================+=====+===+ Source code in tablite/import_utils.py def from_hdf5(T, path, tqdm=_tqdm, pbar=None):\n \"\"\"\n imports an exported hdf5 table.\n\n Note that some loss of type information is to be expected in columns of mixed type:\n >>> t.show(dtype=True)\n +===+===+=====+=====+====+=====+=====+===================+==========+========+===============+===+=========================+=====+===+\n | # | A | B | C | D | E | F | G | H | I | J | K | L | M | O |\n |row|int|mixed|float|str |mixed| bool| datetime | date | time | timedelta |str| int |float|int|\n +---+---+-----+-----+----+-----+-----+-------------------+----------+--------+---------------+---+-------------------------+-----+---+\n | 0 | -1|None | -1.1| |None |False|2023-06-09 09:12:06|2023-06-09|09:12:06| 1 day, 0:00:00|b |-100000000000000000000000| inf| 11|\n | 1 | 1| 1| 1.1|1000|1 | True|2023-06-09 09:12:06|2023-06-09|09:12:06|2 days, 0:06:40|\u55e8 | 100000000000000000000000| -inf|-11|\n +===+===+=====+=====+====+=====+=====+===================+==========+========+===============+===+=========================+=====+===+\n >>> t.to_hdf5(filename)\n >>> t2 = Table.from_hdf5(filename)\n >>> t2.show(dtype=True)\n +===+===+=====+=====+=====+=====+=====+===================+===================+========+===============+===+=========================+=====+===+\n | # | A | B | C | D | E | F | G | H | I | J | K | L | M | O |\n |row|int|mixed|float|mixed|mixed| bool| datetime | datetime | time | str |str| int |float|int|\n +---+---+-----+-----+-----+-----+-----+-------------------+-------------------+--------+---------------+---+-------------------------+-----+---+\n | 0 | -1|None | -1.1|None |None |False|2023-06-09 
09:12:06|2023-06-09 00:00:00|09:12:06|1 day, 0:00:00 |b |-100000000000000000000000| inf| 11|\n | 1 | 1| 1| 1.1| 1000| 1| True|2023-06-09 09:12:06|2023-06-09 00:00:00|09:12:06|2 days, 0:06:40|\u55e8 | 100000000000000000000000| -inf|-11|\n +===+===+=====+=====+=====+=====+=====+===================+===================+========+===============+===+=========================+=====+===+\n \"\"\"\n if not issubclass(T, BaseTable):\n raise TypeError(\"Expected subclass of Table\")\n import h5py\n\n type_check(path, Path)\n t = T()\n with h5py.File(path, \"r\") as h5:\n for col_name in h5.keys():\n dset = h5[col_name]\n arr = np.array(dset[:])\n if arr.dtype == object:\n arr = np.array(DataTypes.guess([v.decode(\"utf-8\") for v in arr]))\n t[col_name] = arr\n return t\n "},{"location":"reference/import_utils/#tablite.import_utils.from_json","title":"tablite.import_utils.from_json(T, jsn) ","text":"Imports tables exported using .to_json Source code in tablite/import_utils.py def from_json(T, jsn):\n \"\"\"\n Imports tables exported using .to_json\n \"\"\"\n if not issubclass(T, BaseTable):\n raise TypeError(\"Expected subclass of Table\")\n import json\n\n type_check(jsn, str)\n d = json.loads(jsn)\n return T(columns=d[\"columns\"])\n "},{"location":"reference/import_utils/#tablite.import_utils.from_html","title":"tablite.import_utils.from_html(T, path, tqdm=_tqdm, pbar=None) ","text":"Source code in tablite/import_utils.py def from_html(T, path, tqdm=_tqdm, pbar=None):\n if not issubclass(T, BaseTable):\n raise TypeError(\"Expected subclass of Table\")\n type_check(path, Path)\n\n if pbar is None:\n total = path.stat().st_size\n pbar = tqdm(total=total, desc=\"from_html\", disable=Config.TQDM_DISABLE)\n\n row_start, row_end = \"<tr>\", \"</tr>\"\n value_start, value_end = \"<th>\", \"</th>\"\n chunk = \"\"\n t = None # will be T()\n start, end = 0, 0\n data = {}\n with path.open(\"r\") as fi:\n while True:\n start = chunk.find(row_start, start) # row tag start\n end = chunk.find(row_end, end) # row tag end\n if start == -1 or end == -1:\n new = fi.read(100_000)\n pbar.update(len(new))\n if new == \"\":\n break\n chunk += new\n continue\n # get indices from chunk\n row = chunk[start + len(row_start) : end]\n fields = [v.rstrip(value_end) for v in row.split(value_start)]\n if not data:\n headers = fields[:]\n data = {f: [] for f in headers}\n continue\n else:\n for field, header in zip(fields, headers):\n data[header].append(field)\n\n chunk = chunk[end + len(row_end) :]\n\n if len(data[headers[0]]) == Config.PAGE_SIZE:\n if t is None:\n t = T(columns=data)\n else:\n for k, v in data.items():\n t[k].extend(DataTypes.guess(v))\n data = {f: [] for f in headers}\n\n for k, v in data.items():\n t[k].extend(DataTypes.guess(v))\n return t\n "},{"location":"reference/import_utils/#tablite.import_utils.excel_reader","title":"tablite.import_utils.excel_reader(T, path, first_row_has_headers=True, header_row_index=0, sheet=None, columns=None, skip_empty='NONE', start=0, limit=sys.maxsize, tqdm=_tqdm, **kwargs) ","text":"returns Table from excel **kwargs are excess arguments that are ignored. 
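A minimal usage sketch; the workbook name 'book1.xlsx', the sheet name and the column names are made up for illustration, and Table.import_file is the documented entry point that is assumed to dispatch .xlsx files to this reader: >>> from tablite import Table >>> t = Table.import_file('book1.xlsx', sheet='Sheet1', columns=['a', 'b']) >>> t.show() 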
Source code in tablite/import_utils.py def excel_reader(T, path, first_row_has_headers=True, header_row_index=0, sheet=None, columns=None, skip_empty=\"NONE\", start=0, limit=sys.maxsize, tqdm=_tqdm, **kwargs):\n \"\"\"\n returns Table from excel\n\n **kwargs are excess arguments that are ignored.\n \"\"\"\n if not issubclass(T, BaseTable):\n raise TypeError(\"Expected subclass of Table\")\n\n book = openpyxl.load_workbook(path, read_only=True, data_only=True)\n\n if sheet is None: # help the user.\n \"\"\"\n If no sheet specified, assume first sheet.\n\n Reasoning:\n Pandas ODS reader does that, so this preserves parity and it might be expected by users.\n If we don't know the sheet name but only have single sheet,\n we would need to take extra steps to find out the name of the sheet.\n We already make assumptions in case of column selection,\n when columns are None, we import all of them.\n \"\"\"\n sheet = book.sheetnames[0]\n elif sheet not in book.sheetnames:\n raise ValueError(f\"sheet not found: {sheet}\")\n\n if not (isinstance(start, int) and start >= 0):\n raise ValueError(\"expected start as an integer >=0\")\n if not (isinstance(limit, int) and limit > 0):\n raise ValueError(\"expected limit as integer > 0\")\n\n worksheet = book[sheet]\n fixup_worksheet(worksheet)\n\n try:\n it_header = worksheet.iter_rows(min_row=header_row_index + 1)\n while True:\n # get the first row to know our headers or the number of columns\n row = [c.value for c in next(it_header)]\n break\n fields = [str(c) if c is not None else \"\" for c in row] # excel is offset by 1\n except StopIteration:\n # excel was empty, return empty table\n return T()\n\n if not first_row_has_headers:\n # since the first row did not contain headers, we use the column count to populate header names\n fields = [str(i) for i in range(len(fields))]\n\n if columns is None:\n # no columns were specified by user to import, that means we import all of the them\n columns = []\n\n for f in fields:\n # fixup the duplicate column names\n columns.append(unique_name(f, columns))\n\n field_dict = {k: i for i, k in enumerate(columns)}\n else:\n field_dict = {}\n\n for k, i in ((k, fields.index(k)) for k in columns):\n # fixup the duplicate column names\n field_dict[unique_name(k, field_dict.keys())] = i\n\n # calculate our data rows iterator offset\n it_offset = start + (1 if first_row_has_headers else 0) + header_row_index + 1\n\n # attempt to fetch number of rows in the sheet\n total_rows = worksheet.max_row\n real_tqdm = True\n\n if total_rows is None:\n # i don't know what causes it but max_row can be None in some cases, so we don't know how large the dataset is\n total_rows = it_offset + limit\n real_tqdm = False\n\n # create the actual data rows iterator\n it_rows = worksheet.iter_rows(min_row=it_offset, max_row=min(it_offset+limit, total_rows))\n it_used_indices = list(field_dict.values())\n\n # filter columns that we're not going to use\n it_rows_filtered = ([row[idx].value for idx in it_used_indices] for row in it_rows)\n\n # create page directory\n workdir = Path(Config.workdir) / Config.pid\n pagesdir = workdir/\"pages\"\n pagesdir.mkdir(exist_ok=True, parents=True)\n\n field_names = list(field_dict.keys())\n column_count = len(field_names)\n\n page_fhs = None\n\n # prepopulate the table with columns\n table = T()\n for name in field_names:\n table[name] = Column(table.path)\n\n pbar_fname = path.name\n if len(pbar_fname) > 20:\n pbar_fname = pbar_fname[0:10] + \"...\" + pbar_fname[-7:]\n\n if real_tqdm:\n # we can create a 
true tqdm progress bar, make one\n tqdm_iter = tqdm(it_rows_filtered, total=total_rows, desc=f\"importing excel: {pbar_fname}\")\n else:\n \"\"\"\n openpyxls was unable to precalculate the size of the excel for whatever reason\n forcing recalc would require parsing entire file\n drop the progress bar in that case, just show iterations\n\n as an alternative we can use \u03a3=1/x but it just doesn't look good, show iterations per second instead\n \"\"\"\n tqdm_iter = tqdm(it_rows_filtered, desc=f\"importing excel: {pbar_fname}\")\n\n tqdm_iter = iter(tqdm_iter)\n\n idx = 0\n\n while True:\n try:\n row = next(tqdm_iter)\n except StopIteration:\n break # because in some cases we can't know the size of excel to set the upper iterator limit we loop until stop iteration is encountered\n\n if skip_empty == \"ALL\" and all(v is None for v in row):\n continue\n elif skip_empty == \"ANY\" and any(v is None for v in row):\n continue\n\n if idx % Config.PAGE_SIZE == 0:\n if page_fhs is not None:\n # we reached the max page file size, fix the pages\n [_fix_xls_page(table, c, fh) for c, fh in zip(field_names, page_fhs)]\n\n page_fhs = [None] * column_count\n\n for cidx in range(column_count):\n # allocate new pages\n pg_path = pagesdir / f\"{next(Page.ids)}.npy\"\n page_fhs[cidx] = open(pg_path, \"wb\")\n\n for fh, value in zip(page_fhs, row):\n \"\"\"\n since excel types are already cast into appropriate type we're going to do two passes per page\n\n we create our temporary custom format:\n packed type|packed byte count|packed bytes|...\n\n available types:\n * q - int64\n * d - float64\n * s - string\n * b - boolean\n * n - none\n * p - pickled (date, time, datetime)\n \"\"\"\n dtype = type(value)\n\n if dtype == int:\n ptype, bytes_ = b'q', struct.pack('q', value) # pack int as int64\n elif dtype == float:\n ptype, bytes_ = b'd', struct.pack('d', value) # pack float as float64\n elif dtype == str:\n ptype, bytes_ = b's', value.encode(\"utf-8\") # pack string\n elif dtype == bool:\n ptype, bytes_ = b'b', b'1' if value else b'0' # pack boolean\n elif value is None:\n ptype, bytes_ = b'n', b'' # pack none\n elif dtype in [date, time, datetime]:\n ptype, bytes_ = b'p', pkl.dumps(value) # pack object types via pickle\n else:\n raise NotImplementedError()\n\n byte_count = struct.pack('I', len(bytes_)) # pack our payload size, i doubt payload size can be over uint32\n\n # dump object to file\n fh.write(ptype)\n fh.write(byte_count)\n fh.write(bytes_)\n\n idx = idx + 1\n\n if page_fhs is not None:\n # we reached end of the loop, fix the pages\n [_fix_xls_page(table, c, fh) for c, fh in zip(field_names, page_fhs)]\n\n return table\n "},{"location":"reference/import_utils/#tablite.import_utils.ods_reader","title":"tablite.import_utils.ods_reader(T, path, first_row_has_headers=True, header_row_index=0, sheet=None, columns=None, skip_empty='NONE', start=0, limit=sys.maxsize, **kwargs) ","text":"returns Table from .ODS Source code in tablite/import_utils.py def ods_reader(T, path, first_row_has_headers=True, header_row_index=0, sheet=None, columns=None, skip_empty=\"NONE\", start=0, limit=sys.maxsize, **kwargs):\n \"\"\"\n returns Table from .ODS\n \"\"\"\n if not issubclass(T, BaseTable):\n raise TypeError(\"Expected subclass of Table\")\n\n if sheet is None:\n data = read_excel(str(path), header=None) # selects first sheet\n else:\n data = read_excel(str(path), sheet_name=sheet, header=None)\n\n data[isna(data)] = None # convert any empty cells to None\n data = data.to_numpy().tolist() # convert pandas to 
list\n\n if skip_empty == \"ALL\" or skip_empty == \"ANY\":\n \"\"\" filter out all rows based on predicate that come after header row \"\"\"\n fn_filter = any if skip_empty == \"ALL\" else all # this is intentional\n data = [\n row\n for ridx, row in enumerate(data)\n if ridx < header_row_index + (1 if first_row_has_headers else 0) or fn_filter(not (v is None or isinstance(v, str) and len(v) == 0) for v in row)\n ]\n\n data = np.array(data, dtype=np.object_) # cast back to numpy array for slicing but don't try to convert datatypes\n\n if not (isinstance(start, int) and start >= 0):\n raise ValueError(\"expected start as an integer >=0\")\n if not (isinstance(limit, int) and limit > 0):\n raise ValueError(\"expected limit as integer > 0\")\n\n t = T()\n\n used_columns_names = set()\n for ix, value in enumerate(data[header_row_index]):\n if first_row_has_headers:\n header, start_row_pos = \"\" if value is None else str(value), (1 + header_row_index)\n else:\n header, start_row_pos = f\"_{ix + 1}\", (0 + header_row_index)\n\n if columns is not None:\n if header not in columns:\n continue\n\n unique_column_name = unique_name(str(header), used_columns_names)\n used_columns_names.add(unique_column_name)\n\n column_values = data[start_row_pos : start_row_pos + limit, ix]\n\n t[unique_column_name] = column_values\n return t\n "},{"location":"reference/import_utils/#tablite.import_utils.text_reader_task","title":"tablite.import_utils.text_reader_task(source, destination, start, end, guess_datatypes, delimiter, text_qualifier, text_escape_openings, text_escape_closures, strip_leading_and_tailing_whitespace, encoding, newline_offsets, fields) ","text":"PARALLEL TASK FUNCTION reads column names + path[start:limit] into hdf5. source: csv or txt file destination: filename for page. start: int: start of page. end: int: end of page. 
guess_datatypes: bool: if True datatypes will be inferred by datatypes.Datatypes.guess delimiter: ',' ';' or '|' text_qualifier: str: commonly \" text_escape_openings: str: default: \"({[ text_escape_closures: str: default: ]})\" strip_leading_and_tailing_whitespace: bool encoding: chardet encoding ('utf-8', 'ascii', ..., 'ISO-2022-CN') Source code in tablite/import_utils.py def text_reader_task(\n source,\n destination,\n start,\n end,\n guess_datatypes,\n delimiter,\n text_qualifier,\n text_escape_openings,\n text_escape_closures,\n strip_leading_and_tailing_whitespace,\n encoding,\n newline_offsets,\n fields\n):\n \"\"\"PARALLEL TASK FUNCTION\n reads column names + path[start:limit] into hdf5.\n\n source: csv or txt file\n destination: filename for page.\n start: int: start of page.\n end: int: end of page.\n guess_datatypes: bool: if True datatypes will be inferred by datatypes.Datatypes.guess\n delimiter: ',' ';' or '|'\n text_qualifier: str: commonly \\\"\n text_escape_openings: str: default: \"({[\n text_escape_closures: str: default: ]})\"\n strip_leading_and_tailing_whitespace: bool\n encoding: chardet encoding ('utf-8', 'ascii', ..., 'ISO-2022-CN')\n \"\"\"\n if isinstance(source, str):\n source = Path(source)\n type_check(source, Path)\n if not source.exists():\n raise FileNotFoundError(f\"File not found: {source}\")\n type_check(destination, list)\n\n # declare CSV dialect.\n delim = delimiter\n\n class Dialect(csv.Dialect):\n delimiter = delim\n quotechar = '\"' if text_qualifier is None else text_qualifier\n escapechar = '\\\\'\n doublequote = True\n quoting = csv.QUOTE_MINIMAL\n skipinitialspace = False if strip_leading_and_tailing_whitespace is None else strip_leading_and_tailing_whitespace\n lineterminator = \"\\n\"\n\n with source.open(\"r\", encoding=encoding, errors=\"ignore\") as fi: # --READ\n fi.seek(newline_offsets[start])\n reader = csv.reader(fi, dialect=Dialect)\n\n # if there's an issue with file handlers on windows, we can make a special case for windows where the file is opened on demand and appended instead of opening all handlers at once\n page_file_handlers = [open(f, mode=\"wb\") for f in destination]\n\n # identify longest str\n longest_str = [1 for _ in range(len(destination))]\n for row in (next(reader) for _ in range(end - start)):\n for idx, c in ((fields[idx], c) for idx, c in filter(lambda t: t[0] in fields, enumerate(row))):\n longest_str[idx] = max(longest_str[idx], len(c))\n\n column_formats = [f\"<U{i}\" for i in longest_str]\n for idx, cf in enumerate(column_formats):\n _create_numpy_header(cf, (end - start, ), page_file_handlers[idx])\n\n # write page arrays to files\n fi.seek(newline_offsets[start])\n for row in (next(reader) for _ in range(end - start)):\n for idx, c in ((fields[idx], c) for idx, c in filter(lambda t: t[0] in fields, enumerate(row))):\n cbytes = np.asarray(c, dtype=column_formats[idx]).tobytes()\n page_file_handlers[idx].write(cbytes)\n\n [phf.close() for phf in page_file_handlers]\n "},{"location":"reference/import_utils/#tablite.import_utils.text_reader","title":"tablite.import_utils.text_reader(T, path, columns, first_row_has_headers, header_row_index, encoding, start, limit, newline, guess_datatypes, text_qualifier, strip_leading_and_tailing_whitespace, skip_empty, delimiter, text_escape_openings, text_escape_closures, tqdm=_tqdm, **kwargs) ","text":"Source code in tablite/import_utils.py def text_reader(\n T,\n path,\n columns,\n first_row_has_headers,\n header_row_index,\n encoding,\n start,\n limit,\n newline,\n 
guess_datatypes,\n text_qualifier,\n strip_leading_and_tailing_whitespace,\n skip_empty,\n delimiter,\n text_escape_openings,\n text_escape_closures,\n tqdm=_tqdm,\n **kwargs,\n):\n if encoding is None:\n encoding = get_encoding(path, nbytes=ENCODING_GUESS_BYTES)\n\n enc = py_to_nim_encoding(encoding)\n pid = Config.workdir / Config.pid\n kwargs = {}\n\n if first_row_has_headers is not None:\n kwargs[\"first_row_has_headers\"] = first_row_has_headers\n if header_row_index is not None:\n kwargs[\"header_row_index\"] = header_row_index\n if columns is not None:\n kwargs[\"columns\"] = columns\n if start is not None:\n kwargs[\"start\"] = start\n if limit is not None and limit != sys.maxsize:\n kwargs[\"limit\"] = limit\n if guess_datatypes is not None:\n kwargs[\"guess_datatypes\"] = guess_datatypes\n if newline is not None:\n kwargs[\"newline\"] = newline\n if delimiter is not None:\n kwargs[\"delimiter\"] = delimiter\n if text_qualifier is not None:\n kwargs[\"text_qualifier\"] = text_qualifier\n kwargs[\"quoting\"] = \"QUOTE_MINIMAL\"\n else:\n kwargs[\"quoting\"] = \"QUOTE_NONE\"\n if strip_leading_and_tailing_whitespace is not None:\n kwargs[\"strip_leading_and_tailing_whitespace\"] = strip_leading_and_tailing_whitespace\n\n if skip_empty is None:\n kwargs[\"skip_empty\"] = \"NONE\"\n else:\n kwargs[\"skip_empty\"] = skip_empty\n\n return nimlite.text_reader(\n T, pid, path, enc,\n **kwargs,\n tqdm=tqdm\n )\n "},{"location":"reference/import_utils/#tablite.import_utils-modules","title":"Modules","text":""},{"location":"reference/imputation/","title":"Imputation","text":""},{"location":"reference/imputation/#tablite.imputation","title":"tablite.imputation ","text":""},{"location":"reference/imputation/#tablite.imputation-classes","title":"Classes","text":""},{"location":"reference/imputation/#tablite.imputation-functions","title":"Functions","text":""},{"location":"reference/imputation/#tablite.imputation.imputation","title":"tablite.imputation.imputation(T, targets, missing=None, method='carry forward', sources=None, tqdm=_tqdm, pbar=None) ","text":"In statistics, imputation is the process of replacing missing data with substituted values. See more: https://en.wikipedia.org/wiki/Imputation_(statistics) PARAMETER DESCRIPTION table source table. TYPE: Table targets column names to find and replace missing values TYPE: str or list of strings missing values to be replaced. TYPE: None or iterable DEFAULT: None method method to be used for replacement. Options: 'carry forward': takes the previous value, and carries forward into fields where values are missing. +: quick. Realistic on time series. -: Can produce strange outliers. 'mean': calculates the column mean (exclude missing ) and copies the mean in as replacement. +: quick -: doesn't work on text. Causes data set to drift towards the mean. 'mode': calculates the column mode (exclude missing ) and copies the mode in as replacement. +: quick -: most frequent value becomes over-represented in the sample 'nearest neighbour': calculates normalised distance between items in source columns selects nearest neighbour and copies value as replacement. +: works for any datatype. -: computationally intensive (e.g. slow) TYPE: str DEFAULT: 'carry forward' sources NEAREST NEIGHBOUR ONLY column names to be used during imputation. if None or empty, all columns will be used. TYPE: list of strings DEFAULT: None RETURNS DESCRIPTION table table with replaced values. 
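A minimal usage sketch; the table t and the target column 'price' are made up for illustration, and the call simply follows the signature above: >>> from tablite.imputation import imputation >>> t2 = imputation(t, targets=['price'], missing={None}, method='mean') If the Table class exposes this as a method, the equivalent call would be t.imputation(targets=['price'], method='mean'). 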
Source code in tablite/imputation.py def imputation(T, targets, missing=None, method=\"carry forward\", sources=None, tqdm=_tqdm, pbar=None):\n \"\"\"\n In statistics, imputation is the process of replacing missing data with substituted values.\n\n See more: https://en.wikipedia.org/wiki/Imputation_(statistics)\n\n Args:\n table (Table): source table.\n\n targets (str or list of strings): column names to find and\n replace missing values\n\n missing (None or iterable): values to be replaced.\n\n method (str): method to be used for replacement. Options:\n\n 'carry forward':\n takes the previous value, and carries forward into fields\n where values are missing.\n +: quick. Realistic on time series.\n -: Can produce strange outliers.\n\n 'mean':\n calculates the column mean (exclude `missing`) and copies\n the mean in as replacement.\n +: quick\n -: doesn't work on text. Causes data set to drift towards the mean.\n\n 'mode':\n calculates the column mode (exclude `missing`) and copies\n the mode in as replacement.\n +: quick\n -: most frequent value becomes over-represented in the sample\n\n 'nearest neighbour':\n calculates normalised distance between items in source columns\n selects nearest neighbour and copies value as replacement.\n +: works for any datatype.\n -: computationally intensive (e.g. slow)\n\n sources (list of strings): NEAREST NEIGHBOUR ONLY\n column names to be used during imputation.\n if None or empty, all columns will be used.\n\n Returns:\n table: table with replaced values.\n \"\"\"\n sub_cls_check(T, BaseTable)\n\n if isinstance(targets, str) and targets not in T.columns:\n targets = [targets]\n if isinstance(targets, list):\n for name in targets:\n if not isinstance(name, str):\n raise TypeError(f\"expected str, not {type(name)}\")\n if name not in T.columns:\n raise ValueError(f\"target item {name} not a column name in T.columns:\\n{T.columns}\")\n else:\n raise TypeError(\"Expected source as list of column names\")\n\n if missing is None:\n missing = {None}\n else:\n missing = set(missing)\n\n if method == \"nearest neighbour\":\n if sources in (None, []):\n sources = list(T.columns)\n if isinstance(sources, str):\n sources = [sources]\n if isinstance(sources, list):\n for name in sources:\n if not isinstance(name, str):\n raise TypeError(f\"expected str, not {type(name)}\")\n if name not in T.columns:\n raise ValueError(f\"source item {name} not a column name in T.columns:\\n{T.columns}\")\n else:\n raise TypeError(\"Expected source as list of column names\")\n\n methods = [\"nearest neighbour\", \"mean\", \"mode\", \"carry forward\"]\n\n if method == \"carry forward\":\n return carry_forward(T, targets, missing, tqdm=tqdm, pbar=pbar)\n elif method in {\"mean\", \"mode\"}:\n return stats_method(T, targets, missing, method, tqdm=tqdm, pbar=pbar)\n elif method == \"nearest neighbour\":\n return nearest_neighbour(T, sources, missing, targets, tqdm=tqdm)\n else:\n raise ValueError(f\"method {method} not recognised amongst known methods: {list(methods)}\")\n "},{"location":"reference/imputation/#tablite.imputation.carry_forward","title":"tablite.imputation.carry_forward(T, targets, missing, tqdm=_tqdm, pbar=None) ","text":"Source code in tablite/imputation.py def carry_forward(T, targets, missing, tqdm=_tqdm, pbar=None):\n assert isinstance(missing, set)\n\n if pbar is None:\n total = len(targets) * len(T)\n pbar = tqdm(total=total, desc=\"imputation.carry_forward\", disable=Config.TQDM_DISABLE)\n\n new = T.copy()\n for name in T.columns:\n if name in targets:\n data = 
T[name][:] # create copy\n last_value = None\n for ix, v in enumerate(data):\n if v in missing: # perform replacement\n data[ix] = last_value\n else: # keep last value.\n last_value = v\n pbar.update(1)\n new[name] = data\n else:\n new[name] = T[name]\n\n return new\n "},{"location":"reference/imputation/#tablite.imputation.stats_method","title":"tablite.imputation.stats_method(T, targets, missing, method, tqdm=_tqdm, pbar=None) ","text":"Source code in tablite/imputation.py def stats_method(T, targets, missing, method, tqdm=_tqdm, pbar=None):\n assert isinstance(missing, set)\n\n if pbar is None:\n total = len(targets)\n pbar = tqdm(total=total, desc=f\"imputation.{method}\", disable=Config.TQDM_DISABLE)\n\n new = T.copy()\n for name in T.columns:\n if name in targets:\n col = T.columns[name]\n assert isinstance(col, Column)\n\n hist_values, hist_counts = col.histogram()\n\n for m in missing:\n try:\n idx = hist_values.index(m)\n hist_counts[idx] = 0\n except ValueError:\n pass\n\n stats = summary_statistics(hist_values, hist_counts)\n\n new_value = stats[method]\n col.replace(mapping={m: new_value for m in missing})\n new[name] = col\n pbar.update(1)\n else:\n new[name] = T[name] # no entropy, keep as is.\n\n return new\n "},{"location":"reference/imputation/#tablite.imputation-modules","title":"Modules","text":""},{"location":"reference/joins/","title":"Joins","text":""},{"location":"reference/joins/#tablite.joins","title":"tablite.joins ","text":""},{"location":"reference/joins/#tablite.joins-classes","title":"Classes","text":""},{"location":"reference/joins/#tablite.joins-functions","title":"Functions","text":""},{"location":"reference/joins/#tablite.joins.join","title":"tablite.joins.join(T: BaseTable, other: BaseTable, left_keys: List[str], right_keys: List[str], left_columns: Union[List[str], None], right_columns: Union[List[str], None], kind: str = 'inner', merge_keys: bool = False, tqdm=_tqdm, pbar=None) ","text":"short-cut for all join functions. PARAMETER DESCRIPTION T left table TYPE: Table other right table TYPE: Table left_keys list of keys for the join from left table. TYPE: list right_keys list of keys for the join from right table. TYPE: list left_columns list of columns names to retain from left table. If None, all are retained. TYPE: list right_columns list of columns names to retain from right table. If None, all are retained. TYPE: list kind 'inner', 'left', 'outer', 'cross'. Defaults to \"inner\". TYPE: str DEFAULT: 'inner' tqdm tqdm progress counter. Defaults to _tqdm. TYPE: tqdm DEFAULT: tqdm pbar tqdm.progressbar. Defaults to None. TYPE: pbar DEFAULT: None RAISES DESCRIPTION ValueError if join type is unknown. RETURNS DESCRIPTION Table joined table. 
Example: \"inner\" SQL: SELECT number, letter FROM numbers JOIN letters ON numbers.colour == letters.color\n Tablite: >>> inner_join = numbers.inner_join(\n letters, \n left_keys=['colour'], \n right_keys=['color'], \n left_columns=['number'], \n right_columns=['letter']\n)\n Example: \"left\" SQL: SELECT number, letter FROM numbers LEFT JOIN letters ON numbers.colour == letters.color\n Tablite: >>> left_join = numbers.left_join(\n letters, \n left_keys=['colour'], \n right_keys=['color'], \n left_columns=['number'], \n right_columns=['letter']\n)\n Example: \"outer\" SQL: SELECT number, letter FROM numbers OUTER JOIN letters ON numbers.colour == letters.color\n Tablite: >>> outer_join = numbers.outer_join(\n letters, \n left_keys=['colour'], \n right_keys=['color'], \n left_columns=['number'], \n right_columns=['letter']\n )\n Example: \"cross\" CROSS JOIN returns the Cartesian product of rows from tables in the join. In other words, it will produce rows which combine each row from the first table with each row from the second table Source code in tablite/joins.py def join(\n T: BaseTable,\n other: BaseTable,\n left_keys: List[str],\n right_keys: List[str],\n left_columns: Union[List[str], None],\n right_columns: Union[List[str], None],\n kind: str = \"inner\",\n merge_keys: bool = False,\n tqdm=_tqdm,\n pbar=None,\n):\n \"\"\"short-cut for all join functions.\n\n Args:\n T (Table): left table\n other (Table): right table\n left_keys (list): list of keys for the join from left table.\n right_keys (list): list of keys for the join from right table.\n left_columns (list): list of columns names to retain from left table.\n If None, all are retained.\n right_columns (list): list of columns names to retain from right table.\n If None, all are retained.\n kind (str, optional): 'inner', 'left', 'outer', 'cross'. Defaults to \"inner\".\n tqdm (tqdm, optional): tqdm progress counter. Defaults to _tqdm.\n pbar (tqdm.pbar, optional): tqdm.progressbar. 
Defaults to None.\n\n Raises:\n ValueError: if join type is unknown.\n\n Returns:\n Table: joined table.\n\n Example: \"inner\"\n ```\n SQL: SELECT number, letter FROM numbers JOIN letters ON numbers.colour == letters.color\n ```\n Tablite: \n ```\n >>> inner_join = numbers.inner_join(\n letters, \n left_keys=['colour'], \n right_keys=['color'], \n left_columns=['number'], \n right_columns=['letter']\n )\n ```\n\n Example: \"left\" \n ```\n SQL: SELECT number, letter FROM numbers LEFT JOIN letters ON numbers.colour == letters.color\n ```\n Tablite: \n ```\n >>> left_join = numbers.left_join(\n letters, \n left_keys=['colour'], \n right_keys=['color'], \n left_columns=['number'], \n right_columns=['letter']\n )\n ```\n\n Example: \"outer\"\n ```\n SQL: SELECT number, letter FROM numbers OUTER JOIN letters ON numbers.colour == letters.color\n ```\n\n Tablite: \n ```\n >>> outer_join = numbers.outer_join(\n letters, \n left_keys=['colour'], \n right_keys=['color'], \n left_columns=['number'], \n right_columns=['letter']\n )\n ```\n\n Example: \"cross\"\n\n CROSS JOIN returns the Cartesian product of rows from tables in the join.\n In other words, it will produce rows which combine each row from the first table\n with each row from the second table\n \"\"\"\n if left_columns is None:\n left_columns = list(T.columns)\n if right_columns is None:\n right_columns = list(other.columns)\n assert merge_keys in {True,False}\n\n _jointype_check(T, other, left_keys, right_keys, left_columns, right_columns)\n\n return _join(kind, T,other,left_keys, right_keys, left_columns, right_columns, merge_keys=merge_keys,\n tqdm=tqdm, pbar=pbar)\n "},{"location":"reference/joins/#tablite.joins.inner_join","title":"tablite.joins.inner_join(T: BaseTable, other: BaseTable, left_keys: List[str], right_keys: List[str], left_columns: Union[List[str], None], right_columns: Union[List[str], None], merge_keys: bool = False, tqdm=_tqdm, pbar=None) ","text":"Source code in tablite/joins.py def inner_join(T: BaseTable, other: BaseTable, left_keys: List[str], right_keys: List[str], \n left_columns: Union[List[str], None], right_columns: Union[List[str], None],\n merge_keys: bool = False, tqdm=_tqdm, pbar=None):\n return join(T, other, left_keys, right_keys, left_columns, right_columns, kind=\"inner\", merge_keys=merge_keys, tqdm=tqdm,pbar=pbar)\n "},{"location":"reference/joins/#tablite.joins.left_join","title":"tablite.joins.left_join(T: BaseTable, other: BaseTable, left_keys: List[str], right_keys: List[str], left_columns: Union[List[str], None], right_columns: Union[List[str], None], merge_keys: bool = False, tqdm=_tqdm, pbar=None) ","text":"Source code in tablite/joins.py def left_join(T: BaseTable, other: BaseTable, left_keys: List[str], right_keys: List[str], \n left_columns: Union[List[str], None], right_columns: Union[List[str], None],\n merge_keys: bool = False, tqdm=_tqdm, pbar=None):\n return join(T, other, left_keys, right_keys, left_columns, right_columns, kind=\"left\", merge_keys=merge_keys, tqdm=tqdm,pbar=pbar)\n "},{"location":"reference/joins/#tablite.joins.outer_join","title":"tablite.joins.outer_join(T: BaseTable, other: BaseTable, left_keys: List[str], right_keys: List[str], left_columns: Union[List[str], None], right_columns: Union[List[str], None], merge_keys: bool = False, tqdm=_tqdm, pbar=None) ","text":"Source code in tablite/joins.py def outer_join(T: BaseTable, other: BaseTable, left_keys: List[str], right_keys: List[str], \n left_columns: Union[List[str], None], right_columns: Union[List[str], 
None],\n merge_keys: bool = False, tqdm=_tqdm, pbar=None):\n return join(T, other, left_keys, right_keys, left_columns, right_columns, kind=\"outer\", merge_keys=merge_keys, tqdm=tqdm,pbar=pbar)\n "},{"location":"reference/joins/#tablite.joins.cross_join","title":"tablite.joins.cross_join(T: BaseTable, other: BaseTable, left_keys: List[str], right_keys: List[str], left_columns: Union[List[str], None], right_columns: Union[List[str], None], merge_keys: bool = False, tqdm=_tqdm, pbar=None) ","text":"Source code in tablite/joins.py def cross_join(T: BaseTable, other: BaseTable, left_keys: List[str], right_keys: List[str], \n left_columns: Union[List[str], None], right_columns: Union[List[str], None],\n merge_keys: bool = False, tqdm=_tqdm, pbar=None):\n return join(T, other, left_keys, right_keys, left_columns, right_columns, kind=\"cross\", merge_keys=merge_keys, tqdm=tqdm,pbar=pbar)\n "},{"location":"reference/lookup/","title":"Lookup","text":""},{"location":"reference/lookup/#tablite.lookup","title":"tablite.lookup ","text":""},{"location":"reference/lookup/#tablite.lookup-attributes","title":"Attributes","text":""},{"location":"reference/lookup/#tablite.lookup-classes","title":"Classes","text":""},{"location":"reference/lookup/#tablite.lookup-functions","title":"Functions","text":""},{"location":"reference/lookup/#tablite.lookup.lookup","title":"tablite.lookup.lookup(T, other, *criteria, all=True, tqdm=_tqdm) ","text":"function for looking up values in other according to criteria in ascending order. :param: T: Table :param: other: Table sorted in ascending search order. :param: criteria: Each criteria must be a tuple with value comparisons in the form: (LEFT, OPERATOR, RIGHT) :param: all: boolean: True=ALL, False=ANY OPERATOR must be a callable that returns a boolean LEFT must be a value that the OPERATOR can compare. RIGHT must be a value that the OPERATOR can compare. Examples: comparison of two columns: ('column A', \"==\", 'column B')\n compare value from column 'Date' with date 24/12. 
('Date', \"<\", DataTypes.date(24,12) )\n uses custom function to compare value from column 'text 1' with value from column 'text 2' f = lambda L,R: all( ord(L) < ord(R) )\n('text 1', f, 'text 2')\n Source code in tablite/lookup.py def lookup(T, other, *criteria, all=True, tqdm=_tqdm):\n \"\"\"function for looking up values in `other` according to criteria in ascending order.\n :param: T: Table \n :param: other: Table sorted in ascending search order.\n :param: criteria: Each criteria must be a tuple with value comparisons in the form:\n (LEFT, OPERATOR, RIGHT)\n :param: all: boolean: True=ALL, False=ANY\n\n OPERATOR must be a callable that returns a boolean\n LEFT must be a value that the OPERATOR can compare.\n RIGHT must be a value that the OPERATOR can compare.\n\n Examples:\n comparison of two columns:\n\n ('column A', \"==\", 'column B')\n\n compare value from column 'Date' with date 24/12.\n\n ('Date', \"<\", DataTypes.date(24,12) )\n\n uses custom function to compare value from column\n 'text 1' with value from column 'text 2'\n\n f = lambda L,R: all( ord(L) < ord(R) )\n ('text 1', f, 'text 2')\n\n \"\"\"\n sub_cls_check(T, BaseTable)\n sub_cls_check(other, BaseTable)\n\n all = all\n any = not all\n\n ops = lookup_ops\n\n functions, left_criteria, right_criteria = [], set(), set()\n\n for left, op, right in criteria:\n left_criteria.add(left)\n right_criteria.add(right)\n if callable(op):\n pass # it's a custom function.\n else:\n op = ops.get(op, None)\n if not callable(op):\n raise ValueError(f\"{op} not a recognised operator for comparison.\")\n\n functions.append((op, left, right))\n left_columns = [n for n in left_criteria if n in T.columns]\n right_columns = [n for n in right_criteria if n in other.columns]\n\n result_index = np.empty(shape=(len(T)), dtype=np.int64)\n cache = {}\n left = T[left_columns]\n Constr = type(T)\n if isinstance(left, Column):\n tmp, left = left, Constr()\n left[left_columns[0]] = tmp\n right = other[right_columns]\n if isinstance(right, Column):\n tmp, right = right, Constr()\n right[right_columns[0]] = tmp\n assert isinstance(left, BaseTable)\n assert isinstance(right, BaseTable)\n\n for ix, row1 in tqdm(enumerate(left.rows), total=len(T), disable=Config.TQDM_DISABLE):\n row1_tup = tuple(row1)\n row1d = {name: value for name, value in zip(left_columns, row1)}\n row1_hash = hash(row1_tup)\n\n match_found = True if row1_hash in cache else False\n\n if not match_found: # search.\n for row2ix, row2 in enumerate(right.rows):\n row2d = {name: value for name, value in zip(right_columns, row2)}\n\n evaluations = {op(row1d.get(left, left), row2d.get(right, right)) for op, left, right in functions}\n # The evaluations above does a neat trick:\n # as L is a dict, L.get(left, L) will return a value\n # from the columns IF left is a column name. 
If it isn't\n # the function will treat left as a value.\n # The same applies to right.\n all_ = all and (False not in evaluations)\n any_ = any and True in evaluations\n if all_ or any_:\n match_found = True\n cache[row1_hash] = row2ix\n break\n\n if not match_found: # no match found.\n cache[row1_hash] = -1 # -1 is replacement for None in the index as numpy can't handle Nones.\n\n result_index[ix] = cache[row1_hash]\n\n f = select_processing_method(2 * max(len(T), len(other)), _sp_lookup, _mp_lookup)\n return f(T, other, result_index)\n "},{"location":"reference/match/","title":"Match","text":""},{"location":"reference/match/#tablite.match","title":"tablite.match ","text":""},{"location":"reference/match/#tablite.match-classes","title":"Classes","text":""},{"location":"reference/match/#tablite.match-functions","title":"Functions","text":""},{"location":"reference/match/#tablite.match.match","title":"tablite.match.match(T, other, *criteria, keep_left=None, keep_right=None) ","text":"performs inner join where T matches other and removes rows that do not match. :param: T: Table :param: other: Table :param: criteria: Each criteria must be a tuple with value comparisons in the form: (LEFT, OPERATOR, RIGHT), where operator must be \"==\"\n\nExample:\n ('column A', \"==\", 'column B')\n\nThis syntax follows the lookup syntax. See Lookup for details.\n :param: keep_left: list of columns to keep. :param: keep_right: list of right columns to keep. Source code in tablite/match.py def match(T, other, *criteria, keep_left=None, keep_right=None): # lookup and filter combined - drops unmatched rows.\n \"\"\"\n performs inner join where `T` matches `other` and removes rows that do not match.\n\n :param: T: Table\n :param: other: Table\n :param: criteria: Each criteria must be a tuple with value comparisons in the form:\n\n (LEFT, OPERATOR, RIGHT), where operator must be \"==\"\n\n Example:\n ('column A', \"==\", 'column B')\n\n This syntax follows the lookup syntax. See Lookup for details.\n\n :param: keep_left: list of columns to keep.\n :param: keep_right: list of right columns to keep.\n \"\"\"\n assert isinstance(T, BaseTable)\n assert isinstance(other, BaseTable)\n if keep_left is None:\n keep_left = [n for n in T.columns]\n else:\n type_check(keep_left, list)\n name_check(T.columns, *keep_left)\n\n if keep_right is None:\n keep_right = [n for n in other.columns]\n else:\n type_check(keep_right, list)\n name_check(other.columns, *keep_right)\n\n indices = np.full(shape=(len(T),), fill_value=-1, dtype=np.int64)\n for arg in criteria:\n b,_,a = arg\n if _ != \"==\":\n raise ValueError(\"match requires A == B. 
For other logic visit `lookup`\")\n if b not in T.columns:\n raise ValueError(f\"Column {b} not found in T for criteria: {arg}\")\n if a not in other.columns:\n raise ValueError(f\"Column {a} not found in other for criteria: {arg}\")\n\n index_update = find_indices(other[a][:], T[b][:], fill_value=-1)\n indices = merge_indices(indices, index_update)\n\n cls = type(T)\n new = cls()\n for name in T.columns:\n if name in keep_left:\n new[name] = np.compress(indices != -1, T[name][:])\n\n for name in other.columns:\n if name in keep_right:\n new_name = unique_name(name, new.columns)\n primary = np.compress(indices != -1, indices)\n new[new_name] = np.take(other[name][:], primary)\n\n return new\n "},{"location":"reference/match/#tablite.match.find_indices","title":"tablite.match.find_indices(x, y, fill_value=-1) ","text":"finds index of y in x Source code in tablite/match.py def find_indices(x,y, fill_value=-1): # fast.\n \"\"\"\n finds index of y in x\n \"\"\"\n # disassembly of numpy:\n # import numpy as np\n # x = np.array([3, 5, 7, 1, 9, 8, 6, 6])\n # y = np.array([2, 1, 5, 10, 100, 6])\n index = np.argsort(x) # array([3, 0, 1, 6, 7, 2, 5, 4])\n sorted_x = x[index] # array([1, 3, 5, 6, 6, 7, 8, 9])\n sorted_index = np.searchsorted(sorted_x, y) # array([1, 0, 2, 8, 8, 3])\n yindex = np.take(index, sorted_index, mode=\"clip\") # array([0, 3, 1, 4, 4, 6])\n mask = x[yindex] != y # array([ True, False, False, True, True, False])\n indices = np.ma.array(yindex, mask=mask, fill_value=fill_value) \n # masked_array(data=[--, 3, 1, --, --, 6], mask=[ True, False, False, True, True, False], fill_value=999999)\n # --: y[0] not in x\n # 3 : y[1] == x[3]\n # 1 : y[2] == x[1]\n # --: y[3] not in x\n # --: y[4] not in x\n # 6 : y[5] == x[6]\n result = np.where(~indices.mask, indices.data, -1) \n return result # array([-1, 3, 1, -1, -1, 6])\n "},{"location":"reference/match/#tablite.match.merge_indices","title":"tablite.match.merge_indices(x1, *args, fill_value=-1) ","text":"merges x1 with the index arrays in args: entries equal to fill_value in x1 are replaced by values taken from the next array Source code in tablite/match.py def merge_indices(x1, *args, fill_value=-1):\n \"\"\"\n merges x1 with the index arrays in args: entries equal to\n fill_value in x1 are replaced by values taken from the next array.\n \"\"\"\n # dis:\n # >>> AA = array([-1, 3, -1, 5])\n # >>> BB = array([-1, -1, 4, 5])\n new = x1[:] # = AA\n for arg in args:\n mask = (new == fill_value) # array([True, False, True, False])\n new = np.where(mask, arg, new) # array([-1, 3, 4, 5])\n return new # array([-1, 3, 4, 5])\n "},{"location":"reference/merge/","title":"Merge","text":""},{"location":"reference/merge/#tablite.merge","title":"tablite.merge ","text":""},{"location":"reference/merge/#tablite.merge-classes","title":"Classes","text":""},{"location":"reference/merge/#tablite.merge-functions","title":"Functions","text":""},{"location":"reference/merge/#tablite.merge.where","title":"tablite.merge.where(T, criteria, left, right, new) ","text":"takes from LEFT where criteria is True else RIGHT and creates a single new column. 
:param: T: Table :param: criteria: np.array(bool): if True take left column else take right column :param left: (str) column name :param right: (str) column name :param new: (str) new name :returns: T Source code in tablite/merge.py def where(T, criteria, left, right, new):\n \"\"\" takes from LEFT where criteria is True else RIGHT \n and creates a single new column.\n\n :param: T: Table\n :param: criteria: np.array(bool): \n if True take left column\n else take right column\n :param left: (str) column name\n :param right: (str) column name\n :param new: (str) new name\n\n :returns: T\n \"\"\"\n type_check(T, BaseTable)\n if isinstance(criteria, np.ndarray):\n if not criteria.dtype == \"bool\":\n raise TypeError\n else:\n criteria = np.array(criteria, dtype='bool')\n\n new_uq = unique_name(new, list(T.columns))\n T.add_column(new_uq)\n col = T[new_uq]\n\n for start,end in Config.page_steps(len(criteria)):\n left_values = T[left][start:end]\n right_values = T[right][start:end]\n new_values = np.where(criteria, left_values, right_values)\n col.extend(new_values)\n\n if new == right:\n T[right] = T[new_uq] # keep column order\n del T[new_uq]\n del T[left]\n elif new == left:\n T[left] = T[new_uq] # keep column order\n del T[new_uq]\n del T[right]\n else:\n T[new] = T[new_uq]\n del T[left]\n del T[right]\n return T\n "},{"location":"reference/mp_utils/","title":"Mp utils","text":""},{"location":"reference/mp_utils/#tablite.mp_utils","title":"tablite.mp_utils ","text":""},{"location":"reference/mp_utils/#tablite.mp_utils-attributes","title":"Attributes","text":""},{"location":"reference/mp_utils/#tablite.mp_utils.lookup_ops","title":"tablite.mp_utils.lookup_ops = {'in': _in, 'not in': not_in, '<': operator.lt, '<=': operator.le, '>': operator.gt, '>=': operator.ge, '!=': operator.ne, '==': operator.eq} module-attribute ","text":""},{"location":"reference/mp_utils/#tablite.mp_utils.filter_ops","title":"tablite.mp_utils.filter_ops = {'>': operator.gt, '>=': operator.ge, '==': operator.eq, '<': operator.lt, '<=': operator.le, '!=': operator.ne, 'in': _in} module-attribute ","text":""},{"location":"reference/mp_utils/#tablite.mp_utils.filter_ops_from_text","title":"tablite.mp_utils.filter_ops_from_text = {'gt': '>', 'gteq': '>=', 'eq': '==', 'lt': '<', 'lteq': '<=', 'neq': '!=', 'in': _in} module-attribute ","text":""},{"location":"reference/mp_utils/#tablite.mp_utils-classes","title":"Classes","text":""},{"location":"reference/mp_utils/#tablite.mp_utils-functions","title":"Functions","text":""},{"location":"reference/mp_utils/#tablite.mp_utils.not_in","title":"tablite.mp_utils.not_in(a, b) ","text":"Source code in tablite/mp_utils.py def not_in(a, b):\n return not operator.contains(str(a), str(b))\n "},{"location":"reference/mp_utils/#tablite.mp_utils.is_mp","title":"tablite.mp_utils.is_mp(fields: int) -> bool ","text":"PARAMETER DESCRIPTION fields number of fields TYPE: int RETURNS DESCRIPTION bool bool Source code in tablite/mp_utils.py def is_mp(fields: int) -> bool:\n \"\"\"\n\n Args:\n fields (int): number of fields\n\n Returns:\n bool\n \"\"\"\n if Config.MULTIPROCESSING_MODE == Config.FORCE:\n return True\n\n if Config.MULTIPROCESSING_MODE == Config.FALSE:\n return False\n\n if fields < Config.SINGLE_PROCESSING_LIMIT:\n return False\n\n if max(psutil.cpu_count(logical=False), 1) < 2:\n return False\n\n return True\n "},{"location":"reference/mp_utils/#tablite.mp_utils.select_processing_method","title":"tablite.mp_utils.select_processing_method(fields, sp, mp) ","text":"PARAMETER DESCRIPTION 
fields number of fields TYPE: int sp method for single processing TYPE: callable mp method for multiprocessing TYPE: callable RETURNS DESCRIPTION _type_ description Source code in tablite/mp_utils.py def select_processing_method(fields, sp, mp):\n \"\"\"\n\n Args:\n fields (int): number of fields\n sp (callable): method for single processing\n mp (callable): method for multiprocessing\n\n Returns:\n _type_: _description_\n \"\"\"\n return mp if is_mp(fields) else sp\n "},{"location":"reference/mp_utils/#tablite.mp_utils.maskify","title":"tablite.mp_utils.maskify(arr) ","text":"Source code in tablite/mp_utils.py def maskify(arr):\n none_mask = [False] * len(arr) # Setting the default\n\n for i in range(len(arr)):\n if arr[i] is None: # Check if our value is None\n none_mask[i] = True\n arr[i] = 0 # Remove None from the original array\n\n return none_mask\n "},{"location":"reference/mp_utils/#tablite.mp_utils.share_mem","title":"tablite.mp_utils.share_mem(inp_arr, dtype) ","text":"Source code in tablite/mp_utils.py def share_mem(inp_arr, dtype):\n len_ = len(inp_arr)\n size = np.dtype(dtype).itemsize * len_\n shape = (len_,)\n\n out_shm = shared_memory.SharedMemory(create=True, size=size) # the co_processors will read this.\n out_arr_index = np.ndarray(shape, dtype=dtype, buffer=out_shm.buf)\n out_arr_index[:] = inp_arr\n\n return out_arr_index, out_shm\n "},{"location":"reference/mp_utils/#tablite.mp_utils.map_task","title":"tablite.mp_utils.map_task(data_shm_name, index_shm_name, destination_shm_name, shape, dtype, start, end) ","text":"Source code in tablite/mp_utils.py def map_task(data_shm_name, index_shm_name, destination_shm_name, shape, dtype, start, end):\n # connect\n shared_data = shared_memory.SharedMemory(name=data_shm_name)\n data = np.ndarray(shape, dtype=dtype, buffer=shared_data.buf)\n\n shared_index = shared_memory.SharedMemory(name=index_shm_name)\n index = np.ndarray(shape, dtype=np.int64, buffer=shared_index.buf)\n\n shared_target = shared_memory.SharedMemory(name=destination_shm_name)\n target = np.ndarray(shape, dtype=dtype, buffer=shared_target.buf)\n # work\n target[start:end] = np.take(data[start:end], index[start:end])\n # disconnect\n shared_data.close()\n shared_index.close()\n shared_target.close()\n "},{"location":"reference/mp_utils/#tablite.mp_utils.reindex_task","title":"tablite.mp_utils.reindex_task(src, dst, index_shm, shm_shape, start, end) ","text":"Source code in tablite/mp_utils.py def reindex_task(src, dst, index_shm, shm_shape, start, end):\n # connect\n existing_shm = shared_memory.SharedMemory(name=index_shm)\n shared_index = np.ndarray(shm_shape, dtype=np.int64, buffer=existing_shm.buf)\n # work\n array = load_numpy(src)\n new = np.take(array, shared_index[start:end])\n np.save(dst, new, allow_pickle=True, fix_imports=False)\n # disconnect\n existing_shm.close()\n "},{"location":"reference/nimlite/","title":"Nimlite","text":""},{"location":"reference/nimlite/#tablite.nimlite","title":"tablite.nimlite ","text":""},{"location":"reference/nimlite/#tablite.nimlite-attributes","title":"Attributes","text":""},{"location":"reference/nimlite/#tablite.nimlite.paths","title":"tablite.nimlite.paths = sys.argv[:] module-attribute ","text":""},{"location":"reference/nimlite/#tablite.nimlite.K","title":"tablite.nimlite.K = TypeVar('K', bound=BaseTable) module-attribute ","text":""},{"location":"reference/nimlite/#tablite.nimlite.ValidEncoders","title":"tablite.nimlite.ValidEncoders = Literal['ENC_UTF8', 'ENC_UTF16', 'ENC_WIN1250'] module-attribute 
","text":""},{"location":"reference/nimlite/#tablite.nimlite.ValidQuoting","title":"tablite.nimlite.ValidQuoting = Literal['QUOTE_MINIMAL', 'QUOTE_ALL', 'QUOTE_NONNUMERIC', 'QUOTE_NONE', 'QUOTE_STRINGS', 'QUOTE_NOTNULL'] module-attribute ","text":""},{"location":"reference/nimlite/#tablite.nimlite.ValidSkipEmpty","title":"tablite.nimlite.ValidSkipEmpty = Literal['NONE', 'ANY', 'ALL'] module-attribute ","text":""},{"location":"reference/nimlite/#tablite.nimlite.ColumnSelectorDict","title":"tablite.nimlite.ColumnSelectorDict = TypedDict('ColumnSelectorDict', {'column': str, 'type': Literal['int', 'float', 'bool', 'str', 'date', 'time', 'datetime'], 'allow_empty': Union[bool, None], 'rename': Union[str, None]}) module-attribute ","text":""},{"location":"reference/nimlite/#tablite.nimlite.FilterCriteria","title":"tablite.nimlite.FilterCriteria = Literal['>', '>=', '==', '<', '<=', '!=', 'in'] module-attribute ","text":""},{"location":"reference/nimlite/#tablite.nimlite.FilterType","title":"tablite.nimlite.FilterType = Literal['all', 'any'] module-attribute ","text":""},{"location":"reference/nimlite/#tablite.nimlite.FilterDict","title":"tablite.nimlite.FilterDict = TypedDict('FilterDict', {'column1': str, 'value1': Union[str, None], 'criteria': FilterCriteria, 'column2': str, 'value2': Union[str, None]}) module-attribute ","text":""},{"location":"reference/nimlite/#tablite.nimlite-classes","title":"Classes","text":""},{"location":"reference/nimlite/#tablite.nimlite-functions","title":"Functions","text":""},{"location":"reference/nimlite/#tablite.nimlite.get_headers","title":"tablite.nimlite.get_headers(path: Union[str, Path], encoding: ValidEncoders = 'ENC_UTF8', *, header_row_index: int = 0, newline: str = '\\n', delimiter: str = ',', text_qualifier: str = '\"', quoting: ValidQuoting, strip_leading_and_tailing_whitespace: bool = True, linecount: int = 10) -> list[list[str]] ","text":"Source code in tablite/nimlite.py def get_headers(\n path: Union[str, Path],\n encoding: ValidEncoders =\"ENC_UTF8\",\n *,\n header_row_index: int=0,\n newline: str='\\n', delimiter: str=',', text_qualifier: str='\"',\n quoting: ValidQuoting, strip_leading_and_tailing_whitespace: bool=True,\n linecount: int = 10\n) -> list[list[str]]:\n return nl.get_headers(\n path=str(path),\n encoding=encoding,\n newline=newline, delimiter=delimiter, text_qualifier=text_qualifier,\n strip_leading_and_tailing_whitespace=strip_leading_and_tailing_whitespace,\n header_row_index=header_row_index,\n quoting=quoting,\n linecount=linecount\n )\n "},{"location":"reference/nimlite/#tablite.nimlite.text_reader","title":"tablite.nimlite.text_reader(T: Type[K], pid: str, path: Union[str, Path], encoding: ValidEncoders = 'ENC_UTF8', *, first_row_has_headers: bool = True, header_row_index: int = 0, columns: List[Union[str, None]] = None, start: Union[str, None] = None, limit: Union[str, None] = None, guess_datatypes: bool = False, newline: str = '\\n', delimiter: str = ',', text_qualifier: str = '\"', quoting: ValidQuoting, strip_leading_and_tailing_whitespace: bool = True, skip_empty: ValidSkipEmpty = 'NONE', tqdm=_tqdm) -> K ","text":"Source code in tablite/nimlite.py def text_reader(\n T: Type[K],\n pid: str, path: Union[str, Path],\n encoding: ValidEncoders =\"ENC_UTF8\",\n *,\n first_row_has_headers: bool=True, header_row_index: int=0,\n columns: List[Union[str, None]]=None,\n start: Union[str, None] = None, limit: Union[str, None]=None,\n guess_datatypes: bool =False,\n newline: str='\\n', delimiter: str=',', text_qualifier: 
str='\"',\n quoting: ValidQuoting, strip_leading_and_tailing_whitespace: bool=True, skip_empty: ValidSkipEmpty = \"NONE\",\n tqdm=_tqdm\n) -> K:\n assert isinstance(path, Path)\n assert isinstance(pid, Path)\n with tqdm(total=10, desc=f\"importing file\") as pbar:\n table = nl.text_reader(\n pid=str(pid),\n path=str(path),\n encoding=encoding,\n first_row_has_headers=first_row_has_headers, header_row_index=header_row_index,\n columns=columns,\n start=start, limit=limit,\n guess_datatypes=guess_datatypes,\n newline=newline, delimiter=delimiter, text_qualifier=text_qualifier,\n quoting=quoting,\n strip_leading_and_tailing_whitespace=strip_leading_and_tailing_whitespace,\n skip_empty=skip_empty,\n page_size=Config.PAGE_SIZE\n )\n\n pbar.update(1)\n\n task_info = table[\"task\"]\n task_columns = table[\"columns\"]\n\n ti_tasks = task_info[\"tasks\"]\n ti_import_field_names = task_info[\"import_field_names\"]\n\n is_windows = platform.system() == \"Windows\"\n use_logical = False if is_windows else True\n\n cpus = max(psutil.cpu_count(logical=use_logical), 1)\n\n pbar_step = 4 / max(len(ti_tasks), 1)\n\n class WrapUpdate:\n def update(self, n):\n pbar.update(n * pbar_step)\n\n wrapped_pbar = WrapUpdate()\n\n def next_task(task: Task, page_info):\n wrapped_pbar.update(1)\n return Task(\n nl.text_reader_task,\n *task.args, **task.kwargs, page_info=page_info\n )\n\n tasks = [\n TaskChain(\n Task(\n nl.collect_text_reader_page_info_task,\n task=t,\n task_info=task_info\n ), next_task=next_task\n ) for t in ti_tasks\n ]\n\n is_sp = False\n\n if Config.MULTIPROCESSING_MODE == Config.FALSE:\n is_sp = True\n elif Config.MULTIPROCESSING_MODE == Config.FORCE:\n is_sp = False\n elif Config.MULTIPROCESSING_MODE == Config.AUTO and cpus <= 1 or len(tasks) <= 1:\n is_sp = True\n\n if is_sp:\n res = []\n\n for task in tasks:\n page = task.execute()\n\n res.append(page)\n else:\n with TaskManager(cpus, error_mode=\"exception\") as tm:\n res = tm.execute(tasks, pbar=wrapped_pbar)\n\n col_path = pid\n column_dict = {\n cols: Column(col_path)\n for cols in ti_import_field_names\n }\n\n for res_pages in res:\n col_map = {\n n: res_pages[i]\n for i, n in enumerate(ti_import_field_names)\n }\n\n for k, c in column_dict.items():\n c.pages.append(col_map[k])\n\n if columns is None:\n columns = [c[\"name\"] for c in task_columns]\n\n table_dict = {\n a[\"name\"]: column_dict[b]\n for a, b in zip(task_columns, columns)\n }\n\n pbar.update(pbar.total - pbar.n)\n\n table = T(columns=table_dict)\n\n return table\n "},{"location":"reference/nimlite/#tablite.nimlite.wrap","title":"tablite.nimlite.wrap(str_: str) -> str ","text":"Source code in tablite/nimlite.py def wrap(str_: str) -> str:\n return '\"' + str_.replace('\"', '\\\\\"').replace(\"'\", \"\\\\'\").replace(\"\\n\", \"\\\\n\").replace(\"\\t\", \"\\\\t\") + '\"'\n "},{"location":"reference/nimlite/#tablite.nimlite.column_select","title":"tablite.nimlite.column_select(table: K, cols: list[ColumnSelectorDict], tqdm=_tqdm, TaskManager=TaskManager) -> Tuple[K, K] ","text":"Source code in tablite/nimlite.py def column_select(table: K, cols: list[ColumnSelectorDict], tqdm=_tqdm, TaskManager=TaskManager) -> Tuple[K, K]:\n with tqdm(total=100, desc=\"column select\", bar_format='{desc}: {percentage:.1f}%|{bar}{r_bar}') as pbar:\n T = type(table)\n dir_pid = Config.workdir / Config.pid\n\n col_infos = nl.collect_column_select_info(table, cols, str(dir_pid), pbar)\n\n columns = col_infos[\"columns\"]\n page_count = col_infos[\"page_count\"]\n is_correct_type = 
col_infos[\"is_correct_type\"]\n desired_column_map = col_infos[\"desired_column_map\"]\n original_pages_map = col_infos[\"original_pages_map\"]\n passed_column_data = col_infos[\"passed_column_data\"]\n failed_column_data = col_infos[\"failed_column_data\"]\n res_cols_pass = col_infos[\"res_cols_pass\"]\n res_cols_fail = col_infos[\"res_cols_fail\"]\n column_names = col_infos[\"column_names\"]\n reject_reason_name = col_infos[\"reject_reason_name\"]\n\n if all(is_correct_type.values()):\n tbl_pass_columns = {\n desired_name: table[desired_info[0]]\n for desired_name, desired_info in desired_column_map.items()\n }\n\n tbl_fail_columns = {\n desired_name: []\n for desired_name in failed_column_data\n }\n\n tbl_pass = T(columns=tbl_pass_columns)\n tbl_fail = T(columns=tbl_fail_columns)\n\n return (tbl_pass, tbl_fail)\n\n task_list_inp = (\n _collect_cs_info(i, columns, res_cols_pass, res_cols_fail, original_pages_map)\n for i in range(page_count)\n )\n\n page_size = Config.PAGE_SIZE\n\n tasks = (\n Task(\n nl.do_slice_convert, str(dir_pid), page_size, columns, reject_reason_name, res_pass, res_fail, desired_column_map, column_names, is_correct_type\n )\n for columns, res_pass, res_fail in task_list_inp\n )\n\n cpu_count = max(psutil.cpu_count(), 1)\n\n if Config.MULTIPROCESSING_MODE == Config.FORCE:\n is_mp = True\n elif Config.MULTIPROCESSING_MODE == Config.FALSE:\n is_mp = False\n elif Config.MULTIPROCESSING_MODE == Config.AUTO:\n is_multithreaded = cpu_count > 1\n is_multipage = page_count > 1\n\n is_mp = is_multithreaded and is_multipage\n\n tbl_pass = T({k: [] for k in passed_column_data})\n tbl_fail = T({k: [] for k in failed_column_data})\n\n converted = []\n step_size = 45 / max(page_count, 1)\n\n if is_mp:\n class WrapUpdate:\n def update(self, n):\n pbar.update(n * step_size)\n\n with TaskManager(min(cpu_count, page_count), error_mode=\"exception\") as tm:\n res = tm.execute(list(tasks), pbar=WrapUpdate())\n\n converted.extend(res)\n else:\n for task in tasks:\n res = task.f(*task.args, **task.kwargs)\n\n converted.append(res)\n pbar.update(step_size)\n\n def extend_table(table, columns):\n for (col_name, pg) in columns:\n table[col_name].pages.append(pg)\n\n for pg_pass, pg_fail in converted:\n extend_table(tbl_pass, pg_pass)\n extend_table(tbl_fail, pg_fail)\n\n pbar.update(pbar.total - pbar.n)\n\n return tbl_pass, tbl_fail\n "},{"location":"reference/nimlite/#tablite.nimlite.read_page","title":"tablite.nimlite.read_page(path: Union[str, Path]) -> np.ndarray ","text":"Source code in tablite/nimlite.py def read_page(path: Union[str, Path]) -> np.ndarray:\n return nl.read_page(str(path))\n "},{"location":"reference/nimlite/#tablite.nimlite.repaginate","title":"tablite.nimlite.repaginate(column: Column) ","text":"Source code in tablite/nimlite.py def repaginate(column: Column):\n nl.repaginate(column)\n "},{"location":"reference/nimlite/#tablite.nimlite.nearest_neighbour","title":"tablite.nimlite.nearest_neighbour(T: BaseTable, sources: Union[list[str], None], missing: Union[list, None], targets: Union[list[str], None], tqdm=_tqdm) ","text":"Source code in tablite/nimlite.py def nearest_neighbour(T: BaseTable, sources: Union[list[str], None], missing: Union[list, None], targets: Union[list[str], None], tqdm=_tqdm):\n return nl.nearest_neighbour(T, sources, list(missing), targets, tqdm)\n "},{"location":"reference/nimlite/#tablite.nimlite.groupby","title":"tablite.nimlite.groupby(T, keys, functions, tqdm=_tqdm) ","text":"Source code in tablite/nimlite.py def groupby(T, keys, 
functions, tqdm=_tqdm):\n return nl.groupby(T, keys, functions, tqdm)\n "},{"location":"reference/nimlite/#tablite.nimlite.filter","title":"tablite.nimlite.filter(table: BaseTable, expressions: list[FilterDict], type: FilterType, tqdm=_tqdm) ","text":"Source code in tablite/nimlite.py def filter(table: BaseTable, expressions: list[FilterDict], type: FilterType, tqdm = _tqdm):\n return nl.filter(table, expressions, type, tqdm)\n "},{"location":"reference/pivots/","title":"Pivots","text":""},{"location":"reference/pivots/#tablite.pivots","title":"tablite.pivots ","text":""},{"location":"reference/pivots/#tablite.pivots-classes","title":"Classes","text":""},{"location":"reference/pivots/#tablite.pivots-functions","title":"Functions","text":""},{"location":"reference/pivots/#tablite.pivots.pivot","title":"tablite.pivots.pivot(T, rows, columns, functions, values_as_rows=True, tqdm=_tqdm, pbar=None) ","text":"param: rows: column names to keep as rows param: columns: column names to keep as columns param: functions: aggregation functions from the Groupby class as example: >>> t.show()\n+=====+=====+=====+\n| A | B | C |\n| int | int | int |\n+-----+-----+-----+\n| 1| 1| 6|\n| 1| 2| 5|\n| 2| 3| 4|\n| 2| 4| 3|\n| 3| 5| 2|\n| 3| 6| 1|\n| 1| 1| 6|\n| 1| 2| 5|\n| 2| 3| 4|\n| 2| 4| 3|\n| 3| 5| 2|\n| 3| 6| 1|\n+=====+=====+=====+\n\n>>> t2 = t.pivot(rows=['C'], columns=['A'], functions=[('B', gb.sum)])\n>>> t2.show()\n+===+===+========+=====+=====+=====+\n| # | C |function|(A=1)|(A=2)|(A=3)|\n|row|int| str |mixed|mixed|mixed|\n+---+---+--------+-----+-----+-----+\n|0 | 6|Sum(B) | 2|None |None |\n|1 | 5|Sum(B) | 4|None |None |\n|2 | 4|Sum(B) |None | 6|None |\n|3 | 3|Sum(B) |None | 8|None |\n|4 | 2|Sum(B) |None |None | 10|\n|5 | 1|Sum(B) |None |None | 12|\n+===+===+========+=====+=====+=====+\n Source code in tablite/pivots.py def pivot(T, rows, columns, functions, values_as_rows=True, tqdm=_tqdm, pbar=None):\n \"\"\"\n param: rows: column names to keep as rows\n param: columns: column names to keep as columns\n param: functions: aggregation functions from the Groupby class as\n\n example:\n ```\n >>> t.show()\n +=====+=====+=====+\n | A | B | C |\n | int | int | int |\n +-----+-----+-----+\n | 1| 1| 6|\n | 1| 2| 5|\n | 2| 3| 4|\n | 2| 4| 3|\n | 3| 5| 2|\n | 3| 6| 1|\n | 1| 1| 6|\n | 1| 2| 5|\n | 2| 3| 4|\n | 2| 4| 3|\n | 3| 5| 2|\n | 3| 6| 1|\n +=====+=====+=====+\n\n >>> t2 = t.pivot(rows=['C'], columns=['A'], functions=[('B', gb.sum)])\n >>> t2.show()\n +===+===+========+=====+=====+=====+\n | # | C |function|(A=1)|(A=2)|(A=3)|\n |row|int| str |mixed|mixed|mixed|\n +---+---+--------+-----+-----+-----+\n |0 | 6|Sum(B) | 2|None |None |\n |1 | 5|Sum(B) | 4|None |None |\n |2 | 4|Sum(B) |None | 6|None |\n |3 | 3|Sum(B) |None | 8|None |\n |4 | 2|Sum(B) |None |None | 10|\n |5 | 1|Sum(B) |None |None | 12|\n +===+===+========+=====+=====+=====+\n ```\n\n \"\"\"\n sub_cls_check(T, BaseTable)\n\n if isinstance(rows, str):\n rows = [rows]\n if not all(isinstance(i, str) for i in rows):\n raise TypeError(f\"Expected rows as a list of column names, not {[i for i in rows if not isinstance(i,str)]}\")\n\n if isinstance(columns, str):\n columns = [columns]\n if not all(isinstance(i, str) for i in columns):\n raise TypeError(\n f\"Expected columns as a list of column names, not {[i for i in columns if not isinstance(i, str)]}\"\n )\n\n if not isinstance(values_as_rows, bool):\n raise TypeError(f\"expected sum_on_rows as boolean, not {type(values_as_rows)}\")\n\n keys = rows + columns\n assert isinstance(keys, list)\n\n 
extra_steps = 2\n\n if pbar is None:\n total = extra_steps\n\n if len(functions) == 0:\n total = total + len(keys)\n else:\n total = total + len(T)\n\n pbar = tqdm(total=total, desc=\"pivot\")\n\n grpby = groupby(T, keys, functions, tqdm=tqdm)\n Constr = type(T)\n\n if len(grpby) == 0: # return empty table. This must be a test?\n pbar.update(extra_steps)\n return Constr()\n\n # split keys to determine grid dimensions\n row_key_index = {}\n col_key_index = {}\n\n r = len(rows)\n c = len(columns)\n g = len(functions)\n\n records = defaultdict(dict)\n\n for row in grpby.rows:\n row_key = tuple(row[:r])\n col_key = tuple(row[r : r + c])\n func_key = tuple(row[r + c :])\n\n if row_key not in row_key_index:\n row_key_index[row_key] = len(row_key_index) # Y\n\n if col_key not in col_key_index:\n col_key_index[col_key] = len(col_key_index) # X\n\n rix = row_key_index[row_key]\n cix = col_key_index[col_key]\n if cix in records:\n if rix in records[cix]:\n raise ValueError(\"this should be empty.\")\n records[cix][rix] = func_key\n\n pbar.update(1)\n result = type(T)()\n\n if values_as_rows: # ---> leads to more rows.\n # first create all columns left to right\n\n n = r + 1 # rows keys + 1 col for function values.\n cols = [[] for _ in range(n)]\n for row, ix in row_key_index.items():\n for col_name, f in functions:\n cols[-1].append(f\"{f}({col_name})\")\n for col_ix, v in enumerate(row):\n cols[col_ix].append(v)\n\n for col_name, values in zip(rows + [\"function\"], cols):\n col_name = unique_name(col_name, result.columns)\n result[col_name] = values\n col_length = len(cols[0])\n cols.clear()\n\n # then populate the sparse matrix.\n for col_key, c in col_key_index.items():\n col_name = \"(\" + \",\".join([f\"{col_name}={value}\" for col_name, value in zip(columns, col_key)]) + \")\"\n col_name = unique_name(col_name, result.columns)\n L = [None for _ in range(col_length)]\n for r, funcs in records[c].items():\n for ix, f in enumerate(funcs):\n L[g * r + ix] = f\n result[col_name] = L\n\n else: # ---> leads to more columns.\n n = r\n cols = [[] for _ in range(n)]\n for row in row_key_index:\n for col_ix, v in enumerate(row):\n cols[col_ix].append(v) # write key columns.\n\n for col_name, values in zip(rows, cols):\n result[col_name] = values\n\n col_length = len(row_key_index)\n\n # now populate the sparse matrix.\n for col_key, c in col_key_index.items(): # select column.\n cols, names = [], []\n\n for f, v in zip(functions, func_key):\n agg_col, func = f\n terms = \",\".join([agg_col] + [f\"{col_name}={value}\" for col_name, value in zip(columns, col_key)])\n col_name = f\"{func}({terms})\"\n col_name = unique_name(col_name, result.columns)\n names.append(col_name)\n cols.append([None for _ in range(col_length)])\n for r, funcs in records[c].items():\n for ix, f in enumerate(funcs):\n cols[ix][r] = f\n for name, col in zip(names, cols):\n result[name] = col\n\n pbar.update(1)\n\n return result\n "},{"location":"reference/pivots/#tablite.pivots.transpose","title":"tablite.pivots.transpose(T, tqdm=_tqdm) ","text":"performs a CCW matrix rotation of the table. 
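A small sketch of the rotation with illustrative values (a dict of columns is a valid Table constructor argument, as used elsewhere in this reference):

from tablite import Table
from tablite.pivots import transpose

t = Table({"name": ["a", "b"], "x": [1, 3], "y": [2, 4]})
t2 = transpose(t)
# the first column supplies the new headers:
# t2 holds {"name": ["x", "y"], "a": [1, 2], "b": [3, 4]}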
Source code in tablite/pivots.py def transpose(T, tqdm=_tqdm):\n \"\"\"performs a CCW matrix rotation of the table.\"\"\"\n sub_cls_check(T, BaseTable)\n\n if len(T.columns) == 0:\n return type(T)()\n\n assert isinstance(T, BaseTable)\n new = type(T)()\n L = list(T.columns)\n new[L[0]] = L[1:]\n for row in tqdm(T.rows, desc=\"table transpose\", total=len(T)):\n new[row[0]] = row[1:]\n return new\n "},{"location":"reference/pivots/#tablite.pivots.pivot_transpose","title":"tablite.pivots.pivot_transpose(T, columns, keep=None, column_name='transpose', value_name='value', tqdm=_tqdm) ","text":"Transpose a selection of columns to rows. PARAMETER DESCRIPTION columns column names to transpose TYPE: list of column names keep column names to keep (repeat) TYPE: list of column names DEFAULT: None RETURNS DESCRIPTION Table with columns transposed to rows Example transpose columns 1,2 and 3 and transpose the remaining columns, except sum . Input: | col1 | col2 | col3 | sun | mon | tue | ... | sat | sum |\n|------|------|------|-----|-----|-----|-----|-----|------|\n| 1234 | 2345 | 3456 | 456 | 567 | | ... | | 1023 |\n| 1244 | 2445 | 4456 | | 7 | | ... | | 7 |\n| ... | | | | | | | | |\n\n>>> t.transpose(keep=[col1, col2, col3], transpose=[sun,mon,tue,wed,thu,fri,sat])`\n\nOutput:\n|col1| col2| col3| transpose| value|\n|----|-----|-----|----------|------|\n|1234| 2345| 3456| sun | 456|\n|1234| 2345| 3456| mon | 567|\n|1244| 2445| 4456| mon | 7|\n Source code in tablite/pivots.py def pivot_transpose(T, columns, keep=None, column_name=\"transpose\", value_name=\"value\", tqdm=_tqdm):\n \"\"\"Transpose a selection of columns to rows.\n\n Args:\n columns (list of column names): column names to transpose\n keep (list of column names): column names to keep (repeat)\n\n Returns:\n Table: with columns transposed to rows\n\n Example:\n transpose columns 1,2 and 3 and transpose the remaining columns, except `sum`.\n\n Input:\n ```\n | col1 | col2 | col3 | sun | mon | tue | ... | sat | sum |\n |------|------|------|-----|-----|-----|-----|-----|------|\n | 1234 | 2345 | 3456 | 456 | 567 | | ... | | 1023 |\n | 1244 | 2445 | 4456 | | 7 | | ... | | 7 |\n | ... 
| | | | | | | | |\n\n >>> t.transpose(keep=[col1, col2, col3], transpose=[sun,mon,tue,wed,thu,fri,sat])`\n\n Output:\n |col1| col2| col3| transpose| value|\n |----|-----|-----|----------|------|\n |1234| 2345| 3456| sun | 456|\n |1234| 2345| 3456| mon | 567|\n |1244| 2445| 4456| mon | 7|\n ```\n\n \"\"\"\n sub_cls_check(T, BaseTable)\n\n if not isinstance(columns, list):\n raise TypeError\n\n for i in columns:\n if not isinstance(i, str):\n raise TypeError\n if i not in T.columns:\n raise ValueError\n if columns.count(i)>1:\n raise ValueError(f\"Column {i} appears more than once\")\n\n if keep is None:\n keep = []\n for i in keep:\n if not isinstance(i, str):\n raise TypeError\n if i not in T.columns:\n raise ValueError\n\n if column_name in keep + columns:\n column_name = unique_name(column_name, set_of_names=keep + columns)\n if value_name in keep + columns + [column_name]:\n value_name = unique_name(value_name, set_of_names=keep + columns)\n\n new = type(T)()\n new.add_columns(*keep + [column_name, value_name])\n news = {name: [] for name in new.columns}\n\n n = len(keep)\n\n with tqdm(total=len(T), desc=\"transpose\", disable=Config.TQDM_DISABLE) as pbar:\n it = T[keep + columns].rows if len(keep + columns) > 1 else ((v, ) for v in T[keep + columns])\n\n for ix, row in enumerate(it, start=1):\n keeps = row[:n]\n transposes = row[n:]\n\n for name, value in zip(keep, keeps):\n news[name].extend([value] * len(transposes))\n for name, value in zip(columns, transposes):\n news[column_name].append(name)\n news[value_name].append(value)\n\n if ix % Config.SINGLE_PROCESSING_LIMIT == 0:\n for name, values in news.items():\n new[name].extend(values)\n values.clear()\n\n pbar.update(1)\n\n for name, values in news.items():\n new[name].extend(np.array(values))\n values.clear()\n return new\n "},{"location":"reference/redux/","title":"Redux","text":""},{"location":"reference/redux/#tablite.redux","title":"tablite.redux ","text":""},{"location":"reference/redux/#tablite.redux-attributes","title":"Attributes","text":""},{"location":"reference/redux/#tablite.redux-classes","title":"Classes","text":""},{"location":"reference/redux/#tablite.redux-functions","title":"Functions","text":""},{"location":"reference/redux/#tablite.redux.filter_all","title":"tablite.redux.filter_all(T, **kwargs) ","text":"returns Table for rows where ALL kwargs match :param kwargs: dictionary with headers and values / boolean callable Examples: t = Table()\nt['a'] = [1,2,3,4]\nt['b'] = [10,20,30,40]\n\ndef f(x):\n return x == 4\ndef g(x):\n return x < 20\n\nt2 = t.any( **{\"a\":f, \"b\":g})\nassert [r for r in t2.rows] == [[1, 10], [4, 40]]\n\nt2 = t.any(a=f,b=g)\nassert [r for r in t2.rows] == [[1, 10], [4, 40]]\n\ndef h(x):\n return x>=2\n\ndef i(x):\n return x<=30\n\nt2 = t.all(a=h,b=i)\nassert [r for r in t2.rows] == [[2,20], [3, 30]]\n Source code in tablite/redux.py def filter_all(T, **kwargs):\n \"\"\"\n returns Table for rows where ALL kwargs match\n :param kwargs: dictionary with headers and values / boolean callable\n\n Examples:\n\n t = Table()\n t['a'] = [1,2,3,4]\n t['b'] = [10,20,30,40]\n\n def f(x):\n return x == 4\n def g(x):\n return x < 20\n\n t2 = t.any( **{\"a\":f, \"b\":g})\n assert [r for r in t2.rows] == [[1, 10], [4, 40]]\n\n t2 = t.any(a=f,b=g)\n assert [r for r in t2.rows] == [[1, 10], [4, 40]]\n\n def h(x):\n return x>=2\n\n def i(x):\n return x<=30\n\n t2 = t.all(a=h,b=i)\n assert [r for r in t2.rows] == [[2,20], [3, 30]]\n\n\n \"\"\"\n sub_cls_check(T, BaseTable)\n\n if not isinstance(kwargs, 
dict):\n raise TypeError(\"did you forget to add the ** in front of your dict?\")\n if not all([k in T.columns for k in kwargs]):\n raise ValueError(f\"Unknown column(s): {[k for k in kwargs if k not in T.columns]}\")\n\n mask = np.full((len(T),), True)\n for k, v in kwargs.items():\n col = T[k]\n for start, end, page in col.iter_by_page():\n data = page.get()\n if callable(v):\n vf = np.frompyfunc(v, 1, 1)\n mask[start:end] = mask[start:end] & np.apply_along_axis(vf, 0, data)\n else:\n mask[start:end] = mask[start:end] & (data == v)\n\n return _compress_one(T, mask)\n "},{"location":"reference/redux/#tablite.redux.drop","title":"tablite.redux.drop(T, *args) ","text":"drops all rows that contain args PARAMETER DESCRIPTION T TYPE: Table Source code in tablite/redux.py def drop(T, *args):\n \"\"\"drops all rows that contain args\n\n Args:\n T (Table):\n \"\"\"\n sub_cls_check(T, BaseTable)\n mask = np.full((len(T),), False)\n for name in T.columns:\n col = T[name]\n for start, end, page in col.iter_by_page():\n data = page.get()\n for arg in args:\n mask[start:end] = mask[start:end] | (data == arg)\n\n mask = np.invert(mask)\n return _compress_one(T, mask)\n "},{"location":"reference/redux/#tablite.redux.filter_any","title":"tablite.redux.filter_any(T, **kwargs) ","text":"returns Table for rows where ANY kwargs match :param kwargs: dictionary with headers and values / boolean callable Source code in tablite/redux.py def filter_any(T, **kwargs):\n \"\"\"\n returns Table for rows where ANY kwargs match\n :param kwargs: dictionary with headers and values / boolean callable\n \"\"\"\n sub_cls_check(T, BaseTable)\n if not isinstance(kwargs, dict):\n raise TypeError(\"did you forget to add the ** in front of your dict?\")\n\n mask = np.full((len(T),), False)\n for k, v in kwargs.items():\n col = T[k]\n for start, end, page in col.iter_by_page():\n data = page.get()\n if callable(v):\n vf = np.frompyfunc(v, 1, 1)\n mask[start:end] = mask[start:end] | np.apply_along_axis(vf, 0, data)\n else:\n mask[start:end] = mask[start:end] | (v == data)\n\n return _compress_one(T, mask)\n "},{"location":"reference/redux/#tablite.redux.compress_both","title":"tablite.redux.compress_both(T, mask, pbar: _tqdm) ","text":"Source code in tablite/redux.py def compress_both(T, mask, pbar: _tqdm):\n # NOTE FOR DEVELOPERS:\n # np.compress is so fast that the overhead of multiprocessing doesn't pay off.\n cls = type(T)\n true, false = cls(), cls()\n\n pbar_div = (len(T.columns) * len(list(Config.page_steps(len(T)))) - 1)\n pbar_step = (10 / pbar_div) if pbar_div != 0 else 0\n\n for name in T.columns:\n true.add_column(name)\n false.add_column(name)\n true_col = true[name] # fetch the col to avoid doing it in the loop below\n false_col = false[name]\n # prevent OOMError by slicing the getitem ops\n for start, end in Config.page_steps(len(T)):\n data = T[name][start:end]\n true_col.extend(np.compress(mask[start:end], data))\n false_col.extend(np.compress(np.invert(mask)[start:end], data))\n if pbar is not None:\n pbar.update(pbar_step)\n return true, false\n "},{"location":"reference/redux/#tablite.redux.get_filter_bitmap","title":"tablite.redux.get_filter_bitmap(T, expressions, pbar: _tqdm) ","text":"Source code in tablite/redux.py def get_filter_bitmap(T, expressions, pbar: _tqdm):\n for expression in expressions:\n if not isinstance(expression, dict):\n raise TypeError(f\"invalid expression: {expression}\")\n if not len(expression) == 3:\n raise ValueError(f\"expected 3 items, got {expression}\")\n x = {\"column1\", 
\"column2\", \"criteria\", \"value1\", \"value2\"}\n if not set(expression.keys()).issubset(x):\n raise ValueError(f\"got unknown key: {set(expression.keys()).difference(x)}\")\n\n if expression[\"criteria\"] not in filter_ops:\n raise ValueError(f\"criteria missing from {expression}\")\n\n c1 = expression.get(\"column1\", None)\n if c1 is not None and c1 not in T.columns:\n raise ValueError(f\"no such column: {c1}\")\n\n v1 = expression.get(\"value1\", None)\n if v1 is not None and c1 is not None:\n raise ValueError(\"filter can only take 1 left expr element. Got 2.\")\n\n c2 = expression.get(\"column2\", None)\n if c2 is not None and c2 not in T.columns:\n raise ValueError(f\"no such column: {c2}\")\n\n v2 = expression.get(\"value2\", None)\n if v2 is not None and c2 is not None:\n raise ValueError(\"filter can only take 1 right expression element. Got 2.\")\n\n # EVALUATION....\n # 1. setup a rectangular bitmap for evaluations\n bitmap = np.empty(shape=(len(expressions), len(T)), dtype=bool)\n pbar_div = (len(expressions) * len(list(Config.page_steps(len(T)))) - 1)\n pbar_step = (10 / pbar_div) if pbar_div != 0 else 0\n # 2. create tasks for evaluations\n for bit_index, expression in enumerate(expressions):\n assert isinstance(expression, dict)\n assert len(expression) == 3\n c1 = expression.get(\"column1\", None)\n c2 = expression.get(\"column2\", None)\n expr = expression.get(\"criteria\", None)\n assert expr in filter_ops\n v1 = expression.get(\"value1\", None)\n v2 = expression.get(\"value2\", None)\n\n for start, end in Config.page_steps(len(T)):\n if c1 is not None:\n dset_A = T[c1][start:end]\n else: # v1 is active:\n dset_A = np.array([v1] * (end - start))\n\n if c2 is not None:\n dset_B = T[c2][start:end]\n else: # v2 is active:\n dset_B = np.array([v2] * (end - start))\n\n if len(dset_A) != len(dset_B):\n raise ValueError(\n f\"Assymmetric dataset: {c1} has {len(dset_A)} values, whilst {c2} has {len(dset_B)} values.\"\n )\n # Evaluate\n try:\n if expr == \">\":\n result = dset_A > dset_B\n elif expr == \">=\":\n result = dset_A >= dset_B\n elif expr == \"==\":\n result = dset_A == dset_B\n elif expr == \"<\":\n result = dset_A < dset_B\n elif expr == \"<=\":\n result = dset_A <= dset_B\n elif expr == \"!=\":\n result = dset_A != dset_B\n else: # it's a python evaluations (slow)\n f = filter_ops.get(expr)\n assert callable(f)\n result = list_to_np_array([f(a, b) for a, b in zip(dset_A, dset_B)])\n except TypeError:\n def safe_test(f, a, b):\n try:\n return f(a, b)\n except TypeError:\n return False\n f = filter_ops.get(expr)\n assert callable(f)\n result = list_to_np_array([safe_test(f, a, b) for a, b in zip(dset_A, dset_B)])\n bitmap[bit_index, start:end] = result\n if pbar is not None:\n pbar.update(pbar_step)\n\n return bitmap\n "},{"location":"reference/redux/#tablite.redux.filter_non_primitive","title":"tablite.redux.filter_non_primitive(T, expressions, filter_type='all', tqdm=_tqdm) ","text":"OBSOLETE filters table PARAMETER DESCRIPTION T Table. 
TYPE: Table subclass expressions str: filters based on an expression, such as: \"all((A==B, C!=4, 200<D))\" which is interpreted using python's compiler to: def _f(A,B,C,D):\n return all((A==B, C!=4, 200<D))\n list of dicts: (example): L = [ {'column1':'A', 'criteria': \"==\", 'column2': 'B'}, {'column1':'C', 'criteria': \"!=\", \"value2\": '4'}, {'value1': 200, 'criteria': \"<\", column2: 'D' } ] TYPE: list or str accepted 'column1', 'column2', 'criteria', 'value1', 'value2' TYPE: dictionary keys filter_type Ignored if expressions is str. 'all' or 'any'. Defaults to \"all\". TYPE: str DEFAULT: 'all' tqdm progressbar. Defaults to _tqdm. TYPE: tqdm DEFAULT: tqdm RETURNS DESCRIPTION 2xTables trues, falses Source code in tablite/redux.py def filter_non_primitive(T, expressions, filter_type=\"all\", tqdm=_tqdm):\n \"\"\"\n OBSOLETE\n filters table\n\n\n Args:\n T (Table subclass): Table.\n expressions (list or str):\n str:\n filters based on an expression, such as:\n \"all((A==B, C!=4, 200<D))\"\n which is interpreted using python's compiler to:\n\n def _f(A,B,C,D):\n return all((A==B, C!=4, 200<D))\n\n list of dicts: (example):\n\n L = [\n {'column1':'A', 'criteria': \"==\", 'column2': 'B'},\n {'column1':'C', 'criteria': \"!=\", \"value2\": '4'},\n {'value1': 200, 'criteria': \"<\", column2: 'D' }\n ]\n\n accepted dictionary keys: 'column1', 'column2', 'criteria', 'value1', 'value2'\n\n filter_type (str, optional): Ignored if expressions is str.\n 'all' or 'any'. Defaults to \"all\".\n tqdm (tqdm, optional): progressbar. Defaults to _tqdm.\n\n Returns:\n 2xTables: trues, falses\n \"\"\"\n # determine method\n warnings.warn(\"Filter using non-primitive types is not recommended.\")\n sub_cls_check(T, BaseTable)\n if len(T) == 0:\n return T.copy(), T.copy()\n\n with tqdm(desc=\"filter\", total=20) as pbar:\n if isinstance(expressions, str):\n mask = _filter_using_expression(T, expressions)\n pbar.update(10)\n elif isinstance(expressions, list):\n mask = _filter_using_list_of_dicts(T, expressions, filter_type, pbar)\n else:\n raise TypeError\n # create new tables\n res = compress_both(T, mask, pbar=pbar)\n pbar.update(pbar.total - pbar.n)\n\n return res\n "},{"location":"reference/redux/#tablite.redux.filter","title":"tablite.redux.filter(T, expressions, filter_type='all', tqdm=_tqdm) ","text":"filters table Note: At the moment only tablite primitive types are supported PARAMETER DESCRIPTION T Table. TYPE: Table subclass expressions str: filters based on an expression, such as: \"all((A==B, C!=4, 200<D))\" which is interpreted using python's compiler to: def _f(A,B,C,D):\n return all((A==B, C!=4, 200<D))\n list of dicts: (example): L = [ {'column1':'A', 'criteria': \"==\", 'column2': 'B'}, {'column1':'C', 'criteria': \"!=\", \"value2\": '4'}, {'value1': 200, 'criteria': \"<\", column2: 'D' } ] TYPE: list or str accepted 'column1', 'column2', 'criteria', 'value1', 'value2' TYPE: dictionary keys filter_type Ignored if expressions is str. 'all' or 'any'. Defaults to \"all\". TYPE: str DEFAULT: 'all' tqdm progressbar. Defaults to _tqdm. 
TYPE: tqdm DEFAULT: tqdm RETURNS DESCRIPTION 2xTables trues, falses Source code in tablite/redux.py def filter(T, expressions, filter_type=\"all\", tqdm=_tqdm):\n \"\"\"filters table\n Note: At the moment only tablite primitive types are supported\n\n Args:\n T (Table subclass): Table.\n expressions (list or str):\n str:\n filters based on an expression, such as:\n \"all((A==B, C!=4, 200<D))\"\n which is interpreted using python's compiler to:\n\n def _f(A,B,C,D):\n return all((A==B, C!=4, 200<D))\n\n list of dicts: (example):\n\n L = [\n {'column1':'A', 'criteria': \"==\", 'column2': 'B'},\n {'column1':'C', 'criteria': \"!=\", \"value2\": '4'},\n {'value1': 200, 'criteria': \"<\", column2: 'D' }\n ]\n\n accepted dictionary keys: 'column1', 'column2', 'criteria', 'value1', 'value2'\n\n filter_type (str, optional): Ignored if expressions is str.\n 'all' or 'any'. Defaults to \"all\".\n tqdm (tqdm, optional): progressbar. Defaults to _tqdm.\n\n Returns:\n 2xTables: trues, falses\n \"\"\"\n # determine method\n sub_cls_check(T, BaseTable)\n if len(T) == 0:\n return T.copy(), T.copy()\n\n if isinstance(expressions, str):\n with tqdm(desc=\"filter\", total=20) as pbar:\n # TODO: make parser for expressions and use the nim implement\n mask = _filter_using_expression(T, expressions)\n pbar.update(10)\n res = compress_both(T, mask, pbar=pbar)\n pbar.update(pbar.total - pbar.n)\n elif isinstance(expressions, list):\n return _filter_using_list_of_dicts_native(T, expressions, filter_type, tqdm)\n else:\n raise TypeError\n # create new tables\n\n return res\n "},{"location":"reference/reindex/","title":"Reindex","text":""},{"location":"reference/reindex/#tablite.reindex","title":"tablite.reindex ","text":""},{"location":"reference/reindex/#tablite.reindex-classes","title":"Classes","text":""},{"location":"reference/reindex/#tablite.reindex-functions","title":"Functions","text":""},{"location":"reference/reindex/#tablite.reindex.reindex","title":"tablite.reindex.reindex(T, index, names=None, tqdm=_tqdm, pbar=None) ","text":"Constant Memory helper for reindexing pages. Memory usage is set by datatype and Config.PAGE_SIZE PARAMETER DESCRIPTION T subclass of Table TYPE: Table index int64. TYPE: array names list of names from T to reindex. TYPE: (list, str) DEFAULT: None tqdm Defaults to _tqdm. TYPE: tqdm DEFAULT: tqdm pbar Defaults to None. 
TYPE: pbar DEFAULT: None RETURNS DESCRIPTION Table the reindexed table Source code in tablite/reindex.py def reindex(T, index, names=None, tqdm=_tqdm, pbar=None):\n    \"\"\"Constant Memory helper for reindexing pages.\n\n    Memory usage is set by datatype and Config.PAGE_SIZE\n\n    Args:\n        T (Table): subclass of Table\n        index (np.array): int64.\n        names (list, str): list of names from T to reindex.\n        tqdm (tqdm, optional): Defaults to _tqdm.\n        pbar (pbar, optional): Defaults to None.\n\n    Returns:\n        Table: the reindexed table.\n    \"\"\"\n    if names is None:\n        names = list(T.columns.keys())\n\n    if pbar is None:\n        total = len(names)\n        pbar = tqdm(total=total, desc=\"join\", disable=Config.TQDM_DISABLE)\n\n    sub_cls_check(T, BaseTable)\n    cls = type(T)\n    result = cls()\n    for name in names:\n        result.add_column(name)\n        col = result[name]\n\n        for start, end in Config.page_steps(len(index)):\n            indices = index[start:end]\n            values = T[name].get_by_indices(indices)\n            # in these values, the index of -1 will be wrong.\n            # so if there is any -1 in the indices, they will\n            # have to be replaced with Nones\n            mask = indices == -1\n            if np.any(mask):\n                nones = np.full(indices.shape, fill_value=None)\n                values = np.where(mask, nones, values)\n            col.extend(values)\n        pbar.update(1)\n\n    return result\n "},{"location":"reference/sort_utils/","title":"Sort utils","text":""},{"location":"reference/sort_utils/#tablite.sort_utils","title":"tablite.sort_utils ","text":""},{"location":"reference/sort_utils/#tablite.sort_utils-attributes","title":"Attributes","text":""},{"location":"reference/sort_utils/#tablite.sort_utils.uca_collator","title":"tablite.sort_utils.uca_collator = Collator() module-attribute ","text":""},{"location":"reference/sort_utils/#tablite.sort_utils.modes","title":"tablite.sort_utils.modes = {'alphanumeric': text_sort, 'unix': unix_sort, 'excel': excel_sort} module-attribute ","text":""},{"location":"reference/sort_utils/#tablite.sort_utils-classes","title":"Classes","text":""},{"location":"reference/sort_utils/#tablite.sort_utils.HashDict","title":"tablite.sort_utils.HashDict ","text":" Bases: dict This class is just a nicety: syntactic sugar for debugging. It functions identically to a regular dictionary; it just uses a tupled key. 
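An illustrative sketch of that claim; with plain keys HashDict behaves exactly like dict:

from tablite.sort_utils import HashDict

d = HashDict()
d["a"] = 1
d[2.5] = "b"
assert "a" in d and d[2.5] == "b"
assert d.keys() == ["a", 2.5]   # keys() unpacks the internal tupled keys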
"},{"location":"reference/sort_utils/#tablite.sort_utils.HashDict-functions","title":"Functions","text":""},{"location":"reference/sort_utils/#tablite.sort_utils.HashDict.items","title":"tablite.sort_utils.HashDict.items() ","text":"Source code in tablite/sort_utils.py def items(self):\n return [(k, v) for (_, k), v in super().items()]\n "},{"location":"reference/sort_utils/#tablite.sort_utils.HashDict.keys","title":"tablite.sort_utils.HashDict.keys() ","text":"Source code in tablite/sort_utils.py def keys(self):\n return [k for (_, k) in super().keys()]\n "},{"location":"reference/sort_utils/#tablite.sort_utils.HashDict.__iter__","title":"tablite.sort_utils.HashDict.__iter__() -> Iterator ","text":"Source code in tablite/sort_utils.py def __iter__(self) -> Iterator:\n return (k for (_, k) in super().keys())\n "},{"location":"reference/sort_utils/#tablite.sort_utils.HashDict.__getitem__","title":"tablite.sort_utils.HashDict.__getitem__(key) ","text":"Source code in tablite/sort_utils.py def __getitem__(self, key):\n return super().__getitem__(self._get_hash(key))\n "},{"location":"reference/sort_utils/#tablite.sort_utils.HashDict.__setitem__","title":"tablite.sort_utils.HashDict.__setitem__(key, value) ","text":"Source code in tablite/sort_utils.py def __setitem__(self, key, value):\n return super().__setitem__(self._get_hash(key), value)\n "},{"location":"reference/sort_utils/#tablite.sort_utils.HashDict.__contains__","title":"tablite.sort_utils.HashDict.__contains__(key) -> bool ","text":"Source code in tablite/sort_utils.py def __contains__(self, key) -> bool:\n return super().__contains__(self._get_hash(key))\n "},{"location":"reference/sort_utils/#tablite.sort_utils.HashDict.__delitem__","title":"tablite.sort_utils.HashDict.__delitem__(key) ","text":"Source code in tablite/sort_utils.py def __delitem__(self, key):\n return super().__delitem__(self._get_hash(key))\n "},{"location":"reference/sort_utils/#tablite.sort_utils.HashDict.__repr__","title":"tablite.sort_utils.HashDict.__repr__() -> str ","text":"Source code in tablite/sort_utils.py def __repr__(self) -> str:\n return '{' + \", \".join([f\"{k}: {v}\" for k, v in self.items()]) + '}'\n "},{"location":"reference/sort_utils/#tablite.sort_utils.HashDict.__str__","title":"tablite.sort_utils.HashDict.__str__() -> str ","text":"Source code in tablite/sort_utils.py def __str__(self) -> str:\n return repr(self)\n "},{"location":"reference/sort_utils/#tablite.sort_utils-functions","title":"Functions","text":""},{"location":"reference/sort_utils/#tablite.sort_utils.text_sort","title":"tablite.sort_utils.text_sort(values, reverse=False) ","text":"Sorts everything as text. 
Source code in tablite/sort_utils.py def text_sort(values, reverse=False):\n \"\"\"\n Sorts everything as text.\n \"\"\"\n text = {str(i): i for i in values}\n L = list(text.keys())\n L.sort(key=uca_collator.sort_key, reverse=reverse)\n d = {text[value]: ix for ix, value in enumerate(L)}\n return d\n "},{"location":"reference/sort_utils/#tablite.sort_utils.unix_sort","title":"tablite.sort_utils.unix_sort(values, reverse=False) ","text":"Unix sortation sorts by the following order: | rank | type | value | +------+-----------+--------------------------------------------+ | 0 | None | floating point -infinite | | 1 | bool | 0 as False, 1 as True | | 2 | int | as numeric value | | 2 | float | as numeric value | | 3 | time | \u03c4 * seconds into the day / (24 * 60 * 60) | | 4 | date | as integer days since 1970/1/1 | | 5 | datetime | as float using date (int) + time (decimal) | | 6 | timedelta | as float using date (int) + time (decimal) | | 7 | str | using unicode | +------+-----------+--------------------------------------------+ \u03c4 = 2 * \u03c0 Source code in tablite/sort_utils.py def unix_sort(values, reverse=False):\n \"\"\"\n Unix sortation sorts by the following order:\n\n | rank | type | value |\n +------+-----------+--------------------------------------------+\n | 0 | None | floating point -infinite |\n | 1 | bool | 0 as False, 1 as True |\n | 2 | int | as numeric value |\n | 2 | float | as numeric value |\n | 3 | time | \u03c4 * seconds into the day / (24 * 60 * 60) |\n | 4 | date | as integer days since 1970/1/1 |\n | 5 | datetime | as float using date (int) + time (decimal) |\n | 6 | timedelta | as float using date (int) + time (decimal) |\n | 7 | str | using unicode |\n +------+-----------+--------------------------------------------+\n\n \u03c4 = 2 * \u03c0\n\n \"\"\"\n text, non_text = [], []\n\n # L = []\n # text = [i for i in values if isinstance(i, str)]\n # text.sort(key=uca_collator.sort_key, reverse=reverse)\n # text_code = _unix_typecodes[str]\n # L = [(text_code, ix, v) for ix, v in enumerate(text)]\n\n for value in values:\n if isinstance(value, str):\n text.append(value)\n else:\n t = type(value)\n TC = _unix_typecodes[t]\n tf = _unix_value_function[t]\n VC = tf(value)\n non_text.append((TC, VC, value))\n non_text.sort(reverse=reverse)\n\n text.sort(key=uca_collator.sort_key, reverse=reverse)\n text_code = _unix_typecodes[str]\n text = [(text_code, ix, v) for ix, v in enumerate(text)]\n\n d = HashDict()\n L = non_text + text\n for ix, (_, _, value) in enumerate(L):\n d[value] = ix\n return d\n "},{"location":"reference/sort_utils/#tablite.sort_utils.excel_sort","title":"tablite.sort_utils.excel_sort(values, reverse=False) ","text":"Excel sortation sorts by the following order: | rank | type | value | +------+-----------+--------------------------------------------+ | 1 | int | as numeric value | | 1 | float | as numeric value | | 1 | time | as seconds into the day / (24 * 60 * 60) | | 1 | date | as integer days since 1900/1/1 | | 1 | datetime | as float using date (int) + time (decimal) | | (1)*| timedelta | as float using date (int) + time (decimal) | | 2 | str | using unicode | | 3 | bool | 0 as False, 1 as True | | 4 | None | floating point infinite. | +------+-----------+--------------------------------------------+ - Excel doesn't have timedelta.
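The same rank-dictionary contract applies here. A sketch via the rank helper, assuming the type codes shown in the table above:

from tablite.sort_utils import rank

r = rank([None, "x", 3, True], reverse=False, mode="excel")
# numeric before text before bool before None:
# r == {3: 0, "x": 1, True: 2, None: 3}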
Source code in tablite/sort_utils.py def excel_sort(values, reverse=False):\n \"\"\"\n Excel sortation sorts by the following order:\n\n | rank | type | value |\n +------+-----------+--------------------------------------------+\n | 1 | int | as numeric value |\n | 1 | float | as numeric value |\n | 1 | time | as seconds into the day / (24 * 60 * 60) |\n | 1 | date | as integer days since 1900/1/1 |\n | 1 | datetime | as float using date (int) + time (decimal) |\n | (1)*| timedelta | as float using date (int) + time (decimal) |\n | 2 | str | using unicode |\n | 3 | bool | 0 as False, 1 as True |\n | 4 | None | floating point infinite. |\n +------+-----------+--------------------------------------------+\n\n * Excel doesn't have timedelta.\n \"\"\"\n\n def tup(TC, value):\n return (TC, _excel_value_function[t](value), value)\n\n text, numeric, booles, nones = [], [], [], []\n for value in values:\n t = type(value)\n TC = _excel_typecodes[t]\n\n if TC == 0:\n numeric.append(tup(TC, value))\n elif TC == 1:\n text.append(value) # text is processed later.\n elif TC == 2:\n booles.append(tup(TC, value))\n elif TC == 3:\n booles.append(tup(TC, value))\n else:\n raise TypeError(f\"no typecode for {value}\")\n\n if text:\n text.sort(key=uca_collator.sort_key, reverse=reverse)\n text = [(2, ix, v) for ix, v in enumerate(text)]\n\n numeric.sort(reverse=reverse)\n booles.sort(reverse=reverse)\n nones.sort(reverse=reverse)\n\n if reverse:\n L = nones + booles + text + numeric\n else:\n L = numeric + text + booles + nones\n d = {value: ix for ix, (_, _, value) in enumerate(L)}\n return d\n "},{"location":"reference/sort_utils/#tablite.sort_utils.rank","title":"tablite.sort_utils.rank(values, reverse, mode) ","text":"values: list of values to sort. reverse: bool mode: as 'text', as 'numeric' or as 'excel' return: dict: d[value] = rank Source code in tablite/sort_utils.py def rank(values, reverse, mode):\n \"\"\"\n values: list of values to sort.\n reverse: bool\n mode: as 'text', as 'numeric' or as 'excel'\n return: dict: d[value] = rank\n \"\"\"\n if mode not in modes:\n raise ValueError(f\"{mode} not in list of modes: {list(modes)}\")\n f = modes.get(mode)\n return f(values, reverse)\n "},{"location":"reference/sortation/","title":"Sortation","text":""},{"location":"reference/sortation/#tablite.sortation","title":"tablite.sortation ","text":""},{"location":"reference/sortation/#tablite.sortation-attributes","title":"Attributes","text":""},{"location":"reference/sortation/#tablite.sortation-classes","title":"Classes","text":""},{"location":"reference/sortation/#tablite.sortation-functions","title":"Functions","text":""},{"location":"reference/sortation/#tablite.sortation.sort_index","title":"tablite.sortation.sort_index(T, mapping, sort_mode='excel', tqdm=_tqdm, pbar=None) ","text":"helper for methods sort and is_sorted param: sort_mode: str: \"alphanumeric\", \"unix\", or, \"excel\" (default) param: **kwargs: sort criteria. See Table.sort() Source code in tablite/sortation.py def sort_index(T, mapping, sort_mode=\"excel\", tqdm=_tqdm, pbar=None):\n \"\"\"\n helper for methods `sort` and `is_sorted`\n\n param: sort_mode: str: \"alphanumeric\", \"unix\", or, \"excel\" (default)\n param: **kwargs: sort criteria. 
See Table.sort()\n \"\"\"\n\n sub_cls_check(T, BaseTable)\n\n if not isinstance(mapping, dict) or not mapping:\n raise TypeError(\"Expected mapping (dict)?\")\n\n for k, v in mapping.items():\n if k not in T.columns:\n raise ValueError(f\"no column {k}\")\n if not isinstance(v, bool):\n raise ValueError(f\"{k} was mapped to {v} - a non-boolean\")\n\n if sort_mode not in sort_modes:\n raise ValueError(f\"{sort_mode} not in list of sort_modes: {list(sort_modes)}\")\n\n rank = {i: tuple() for i in range(len(T))} # create index and empty tuple for sortation.\n\n _pbar = tqdm(total=len(mapping.items()), desc=\"creating sort index\") if pbar is None else pbar\n\n for key, reverse in mapping.items():\n col = T[key][:]\n ranks = sort_rank(values=[numpy_to_python(v) for v in multitype_set(col)], reverse=reverse, mode=sort_mode)\n assert isinstance(ranks, dict)\n for ix, v in enumerate(col):\n v2 = numpy_to_python(v)\n rank[ix] += (ranks[v2],) # add tuple for each sortation level.\n\n _pbar.update(1)\n\n del col\n del ranks\n\n new_order = [(r, i) for i, r in rank.items()] # tuples are listed and sort...\n del rank # free memory.\n\n new_order.sort()\n sorted_index = [i for _, i in new_order] # new index is extracted.\n new_order.clear()\n return np.array(sorted_index, dtype=np.int64)\n "},{"location":"reference/sortation/#tablite.sortation.reindex","title":"tablite.sortation.reindex(T, index) ","text":"index: list of integers that declare sort order. Examples: Table: ['a','b','c','d','e','f','g','h']\nindex: [0,2,4,6]\nresult: ['b','d','f','h']\n\nTable: ['a','b','c','d','e','f','g','h']\nindex: [0,2,4,6,1,3,5,7]\nresult: ['a','c','e','g','b','d','f','h']\n Source code in tablite/sortation.py def reindex(T, index):\n \"\"\"\n index: list of integers that declare sort order.\n\n Examples:\n\n Table: ['a','b','c','d','e','f','g','h']\n index: [0,2,4,6]\n result: ['b','d','f','h']\n\n Table: ['a','b','c','d','e','f','g','h']\n index: [0,2,4,6,1,3,5,7]\n result: ['a','c','e','g','b','d','f','h']\n\n \"\"\"\n sub_cls_check(T, BaseTable)\n if isinstance(index, list):\n index = np.array(index, dtype=int)\n type_check(index, np.ndarray)\n if max(index) >= len(T):\n raise IndexError(\"index out of range: max(index) > len(self)\")\n if min(index) < -len(T):\n raise IndexError(\"index out of range: min(index) < -len(self)\")\n\n fields = len(T) * len(T.columns)\n m = select_processing_method(fields, _reindex, _mp_reindex)\n return m(T, index)\n "},{"location":"reference/sortation/#tablite.sortation.sort","title":"tablite.sortation.sort(T, mapping, sort_mode='excel', tqdm=_tqdm, pbar: _tqdm = None) ","text":"Perform multi-pass sorting with precedence given order of column names. sort_mode: str: \"alphanumeric\", \"unix\", or, \"excel\" kwargs: keys: columns, values: 'reverse' as boolean. examples: Table.sort('A'=False) means sort by 'A' in ascending order. Table.sort('A'=True, 'B'=False) means sort 'A' in descending order, then (2nd priority) sort B in ascending order. 
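The docstring examples above are written in an informal keyword style; the signature itself takes the sort criteria as a mapping dict. A sketch:

from tablite import Table
from tablite.sortation import sort

t = Table({"A": [2, 1, 2], "B": [9, 8, 7]})
t2 = sort(t, {"A": True, "B": False})   # 'A' descending, then 'B' ascending
# expected row order: (2, 7), (2, 9), (1, 8)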
Source code in tablite/sortation.py def sort(T, mapping, sort_mode=\"excel\", tqdm=_tqdm, pbar: _tqdm = None):\n \"\"\"Perform multi-pass sorting with precedence given order of column names.\n sort_mode: str: \"alphanumeric\", \"unix\", or, \"excel\"\n kwargs:\n keys: columns,\n values: 'reverse' as boolean.\n\n examples:\n Table.sort('A'=False) means sort by 'A' in ascending order.\n Table.sort('A'=True, 'B'=False) means sort 'A' in descending order, then (2nd priority)\n sort B in ascending order.\n \"\"\"\n sub_cls_check(T, BaseTable)\n\n index = sort_index(T, mapping, sort_mode=sort_mode, tqdm=_tqdm, pbar=pbar)\n m = select_processing_method(len(T) * len(T.columns), _sp_reindex, _mp_reindex)\n return m(T, index, tqdm=tqdm, pbar=pbar)\n "},{"location":"reference/sortation/#tablite.sortation.is_sorted","title":"tablite.sortation.is_sorted(T, mapping, sort_mode='excel') ","text":"Performs multi-pass sorting check with precedence given order of column names. PARAMETER DESCRIPTION mapping sort criteria. See Table.sort() RETURNS DESCRIPTION bool Source code in tablite/sortation.py def is_sorted(T, mapping, sort_mode=\"excel\"):\n \"\"\"Performs multi-pass sorting check with precedence given order of column names.\n\n Args:\n mapping: sort criteria. See Table.sort()\n sort_mode = sort mode. See Table.sort()\n\n Returns:\n bool\n \"\"\"\n index = sort_index(T, mapping, sort_mode=sort_mode)\n match = np.arange(len(T))\n return np.all(index == match)\n "},{"location":"reference/tools/","title":"Tools","text":""},{"location":"reference/tools/#tablite.tools","title":"tablite.tools ","text":""},{"location":"reference/tools/#tablite.tools-attributes","title":"Attributes","text":""},{"location":"reference/tools/#tablite.tools.guess","title":"tablite.tools.guess = DataTypes.guess module-attribute ","text":""},{"location":"reference/tools/#tablite.tools.xround","title":"tablite.tools.xround = DataTypes.round module-attribute ","text":""},{"location":"reference/tools/#tablite.tools-classes","title":"Classes","text":""},{"location":"reference/tools/#tablite.tools-functions","title":"Functions","text":""},{"location":"reference/tools/#tablite.tools.head","title":"tablite.tools.head(path, linecount=5, delimiter=None) ","text":"Gets the head of any supported file format. 
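A one-line sketch (the file path is hypothetical):

from tablite.tools import head

head("sales.csv", linecount=3)   # the leading rows, e.g. as lists of cell strings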
Source code in tablite/tools.py def head(path, linecount=5, delimiter=None):\n \"\"\"\n Gets the head of any supported file format.\n \"\"\"\n return get_headers(path, linecount=linecount, delimiter=delimiter)\n "},{"location":"reference/utils/","title":"Utils","text":""},{"location":"reference/utils/#tablite.utils","title":"tablite.utils ","text":""},{"location":"reference/utils/#tablite.utils-attributes","title":"Attributes","text":""},{"location":"reference/utils/#tablite.utils.letters","title":"tablite.utils.letters = string.ascii_lowercase + string.digits module-attribute ","text":""},{"location":"reference/utils/#tablite.utils.NoneType","title":"tablite.utils.NoneType = type(None) module-attribute ","text":""},{"location":"reference/utils/#tablite.utils.required_keys","title":"tablite.utils.required_keys = {'min', 'max', 'mean', 'median', 'stdev', 'mode', 'distinct', 'iqr_low', 'iqr_high', 'iqr', 'sum', 'summary type', 'histogram'} module-attribute ","text":""},{"location":"reference/utils/#tablite.utils.summary_methods","title":"tablite.utils.summary_methods = {bool: _boolean_statistics_summary, int: _numeric_statistics_summary, float: _numeric_statistics_summary, str: _string_statistics_summary, date: _date_statistics_summary, datetime: _datetime_statistics_summary, time: _time_statistics_summary, timedelta: _timedelta_statistics_summary, type(None): _none_type_summary} module-attribute ","text":""},{"location":"reference/utils/#tablite.utils-classes","title":"Classes","text":""},{"location":"reference/utils/#tablite.utils-functions","title":"Functions","text":""},{"location":"reference/utils/#tablite.utils.generate_random_string","title":"tablite.utils.generate_random_string(len) ","text":"Source code in tablite/utils.py def generate_random_string(len):\n return \"\".join(random.choice(letters) for i in range(len))\n "},{"location":"reference/utils/#tablite.utils.type_check","title":"tablite.utils.type_check(var, kind) ","text":"Source code in tablite/utils.py def type_check(var, kind):\n if not isinstance(var, kind):\n raise TypeError(f\"Expected {kind}, not {type(var)}\")\n "},{"location":"reference/utils/#tablite.utils.sub_cls_check","title":"tablite.utils.sub_cls_check(c, kind) ","text":"Source code in tablite/utils.py def sub_cls_check(c, kind):\n if not issubclass(type(c), kind):\n raise TypeError(f\"Expected {kind}, not {type(c)}\")\n "},{"location":"reference/utils/#tablite.utils.name_check","title":"tablite.utils.name_check(options, *names) ","text":"Source code in tablite/utils.py def name_check(options, *names):\n for n in names:\n if n not in options:\n raise ValueError(f\"{n} not in {options}\")\n "},{"location":"reference/utils/#tablite.utils.unique_name","title":"tablite.utils.unique_name(wanted_name, set_of_names) ","text":"returns a wanted_name as wanted_name_i given a list of names which guarantees unique naming. 
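For instance:

from tablite.utils import unique_name

unique_name("value", {"value", "value_1"})   # -> 'value_2'
unique_name("fresh", {"value"})              # -> 'fresh'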
Source code in tablite/utils.py def unique_name(wanted_name, set_of_names):\n \"\"\"\n returns a wanted_name as wanted_name_i given a list of names\n which guarantees unique naming.\n \"\"\"\n if not isinstance(set_of_names, set):\n set_of_names = set(set_of_names)\n name, i = wanted_name, 1\n while name in set_of_names:\n name = f\"{wanted_name}_{i}\"\n i += 1\n return name\n "},{"location":"reference/utils/#tablite.utils.expression_interpreter","title":"tablite.utils.expression_interpreter(expression, columns) ","text":"Interprets valid expressions such as: \"all((A==B, C!=4, 200<D))\"\n as def _f(A,B,C,D): return all((A==B, C!=4, 200<D)) using python's compiler. Source code in tablite/utils.py def expression_interpreter(expression, columns):\n \"\"\"\n Interprets valid expressions such as:\n\n \"all((A==B, C!=4, 200<D))\"\n\n as:\n def _f(A,B,C,D):\n return all((A==B, C!=4, 200<D))\n\n using python's compiler.\n \"\"\"\n if not isinstance(expression, str):\n raise TypeError(f\"`{expression}` is not a str\")\n if not isinstance(columns, list):\n raise TypeError\n if not all(isinstance(i, str) for i in columns):\n raise TypeError\n\n req_columns = \", \".join(i for i in columns if i in expression)\n script = f\"def f({req_columns}):\\n return {expression}\"\n tree = ast.parse(script)\n code = compile(tree, filename=\"blah\", mode=\"exec\")\n namespace = {}\n exec(code, namespace)\n f = namespace[\"f\"]\n if not callable(f):\n raise ValueError(f\"The expression could not be parse: {expression}\")\n return f\n "},{"location":"reference/utils/#tablite.utils.intercept","title":"tablite.utils.intercept(A, B) ","text":"Enables calculation of the intercept of two range objects. Used to determine if a datablock contains a slice. PARAMETER DESCRIPTION A range B range RETURNS DESCRIPTION range The intercept of ranges A and B. 
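Two illustrative checks of the overlap logic:

from tablite.utils import intercept

intercept(range(0, 10, 2), range(4, 20, 3))   # -> range(4, 10, 6), i.e. [4]
intercept(range(0, 5), range(3, 8))           # -> range(3, 5), i.e. [3, 4]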
Source code in tablite/utils.py def intercept(A, B):\n \"\"\"Enables calculation of the intercept of two range objects.\n Used to determine if a datablock contains a slice.\n\n Args:\n A: range\n B: range\n\n Returns:\n range: The intercept of ranges A and B.\n \"\"\"\n type_check(A, range)\n type_check(B, range)\n\n if A.step < 1:\n A = range(A.stop + 1, A.start + 1, 1)\n if B.step < 1:\n B = range(B.stop + 1, B.start + 1, 1)\n\n if len(A) == 0:\n return range(0)\n if len(B) == 0:\n return range(0)\n\n if A.stop <= B.start:\n return range(0)\n if A.start >= B.stop:\n return range(0)\n\n if A.start <= B.start:\n if A.stop <= B.stop:\n start, end = B.start, A.stop\n elif A.stop > B.stop:\n start, end = B.start, B.stop\n else:\n raise ValueError(\"bad logic\")\n elif A.start < B.stop:\n if A.stop <= B.stop:\n start, end = A.start, A.stop\n elif A.stop > B.stop:\n start, end = A.start, B.stop\n else:\n raise ValueError(\"bad logic\")\n else:\n raise ValueError(\"bad logic\")\n\n a_steps = math.ceil((start - A.start) / A.step)\n a_start = (a_steps * A.step) + A.start\n\n b_steps = math.ceil((start - B.start) / B.step)\n b_start = (b_steps * B.step) + B.start\n\n if A.step == 1 or B.step == 1:\n start = max(a_start, b_start)\n step = max(A.step, B.step)\n return range(start, end, step)\n elif A.step == B.step:\n a, b = min(A.start, B.start), max(A.start, B.start)\n if (b - a) % A.step != 0: # then the ranges are offset.\n return range(0)\n else:\n return range(b, end, step)\n else:\n # determine common step size:\n step = max(A.step, B.step) if math.gcd(A.step, B.step) != 1 else A.step * B.step\n # examples:\n # 119 <-- 17 if 1 != 1 else 119 <-- max(7, 17) if math.gcd(7, 17) != 1 else 7 * 17\n # 30 <-- 30 if 3 != 1 else 90 <-- max(3, 30) if math.gcd(3, 30) != 1 else 3*30\n if A.step < B.step:\n for n in range(a_start, end, A.step): # increment in smallest step to identify the first common value.\n if n < b_start:\n continue\n elif (n - b_start) % B.step == 0:\n return range(n, end, step) # common value found.\n else:\n for n in range(b_start, end, B.step):\n if n < a_start:\n continue\n elif (n - a_start) % A.step == 0:\n return range(n, end, step)\n\n return range(0)\n "},{"location":"reference/utils/#tablite.utils.summary_statistics","title":"tablite.utils.summary_statistics(values, counts) ","text":"values: any type counts: integer returns dict with: - min (int/float, length of str, date) - max (int/float, length of str, date) - mean (int/float, length of str, date) - median (int/float, length of str, date) - stdev (int/float, length of str, date) - mode (int/float, length of str, date) - distinct (number of distinct values) - iqr (int/float, length of str, date) - sum (int/float, length of str, date) - histogram (2 arrays: values, count of each values) Source code in tablite/utils.py def summary_statistics(values, counts):\n \"\"\"\n values: any type\n counts: integer\n\n returns dict with:\n - min (int/float, length of str, date)\n - max (int/float, length of str, date)\n - mean (int/float, length of str, date)\n - median (int/float, length of str, date)\n - stdev (int/float, length of str, date)\n - mode (int/float, length of str, date)\n - distinct (number of distinct values)\n - iqr (int/float, length of str, date)\n - sum (int/float, length of str, date)\n - histogram (2 arrays: values, count of each values)\n \"\"\"\n # determine the dominant datatype:\n dtypes = defaultdict(int)\n most_frequent, most_frequent_dtype = 0, int\n for v, c in zip(values, counts):\n dtype = type(v)\n total 
= dtypes[dtype] + c\n dtypes[dtype] = total\n if total > most_frequent:\n most_frequent_dtype = dtype\n most_frequent = total\n\n if most_frequent == 0:\n return {}\n\n most_frequent_dtype = max(dtypes, key=dtypes.get)\n mask = [type(v) == most_frequent_dtype for v in values]\n v = list(compress(values, mask))\n c = list(compress(counts, mask))\n\n f = summary_methods.get(most_frequent_dtype, int)\n result = f(v, c)\n result[\"distinct\"] = len(values)\n result[\"summary type\"] = most_frequent_dtype.__name__\n result[\"histogram\"] = [values, counts]\n assert set(result.keys()) == required_keys, \"Key missing!\"\n return result\n "},{"location":"reference/utils/#tablite.utils.date_range","title":"tablite.utils.date_range(start, stop, step) ","text":"Source code in tablite/utils.py def date_range(start, stop, step):\n if not isinstance(start, datetime):\n raise TypeError(\"start is not datetime\")\n if not isinstance(stop, datetime):\n raise TypeError(\"stop is not datetime\")\n if not isinstance(step, timedelta):\n raise TypeError(\"step is not timedelta\")\n n = (stop - start) // step\n return [start + step * i for i in range(n)]\n "},{"location":"reference/utils/#tablite.utils.dict_to_rows","title":"tablite.utils.dict_to_rows(d) ","text":"Source code in tablite/utils.py def dict_to_rows(d):\n type_check(d, dict)\n rows = []\n max_length = max(len(i) for i in d.values())\n order = list(d.keys())\n rows.append(order)\n for i in range(max_length):\n row = [d[k][i] for k in order]\n rows.append(row)\n return rows\n "},{"location":"reference/utils/#tablite.utils.calc_col_count","title":"tablite.utils.calc_col_count(letters: str) ","text":"Source code in tablite/utils.py def calc_col_count(letters: str):\n ord_nil = ord(\"A\") - 1\n cols_per_letter = ord(\"Z\") - ord_nil\n col_count = 0\n\n for i, v in enumerate(reversed(letters)):\n col_count = col_count + (ord(v) - ord_nil) * pow(cols_per_letter, i)\n\n return col_count\n "},{"location":"reference/utils/#tablite.utils.calc_true_dims","title":"tablite.utils.calc_true_dims(sheet) ","text":"Source code in tablite/utils.py def calc_true_dims(sheet):\n src = sheet._get_source()\n max_col, max_row = 0, 0\n\n regex = re.compile(\"\\d+\")\n\n def handleStartElement(name, attrs):\n nonlocal max_col, max_row\n\n if name == \"c\":\n last_index = attrs[\"r\"]\n idx, _ = next(regex.finditer(last_index)).span()\n letters, digits = last_index[0:idx], int(last_index[idx:])\n\n col_idx, row_idx = calc_col_count(letters), digits\n\n max_col, max_row = max(max_col, col_idx), max(max_row, row_idx)\n\n parser = expat.ParserCreate()\n parser.buffer_text = True\n parser.StartElementHandler = handleStartElement\n parser.ParseFile(src)\n\n return max_col, max_row\n "},{"location":"reference/utils/#tablite.utils.fixup_worksheet","title":"tablite.utils.fixup_worksheet(worksheet) ","text":"Source code in tablite/utils.py def fixup_worksheet(worksheet):\n try:\n ws_cols, ws_rows = calc_true_dims(worksheet)\n\n worksheet._max_column = ws_cols\n worksheet._max_row = ws_rows\n except Exception as e:\n logging.error(f\"Failed to fetch true dimensions: {e}\")\n "},{"location":"reference/utils/#tablite.utils.update_access_time","title":"tablite.utils.update_access_time(path) ","text":"Source code in tablite/utils.py def update_access_time(path):\n path = Path(path)\n stat = path.stat()\n os.utime(path, (now(), stat.st_mtime))\n "},{"location":"reference/utils/#tablite.utils.load_numpy","title":"tablite.utils.load_numpy(path) ","text":"Source code in tablite/utils.py def 
load_numpy(path):\n update_access_time(path)\n\n return np.load(path, allow_pickle=True, fix_imports=False)\n "},{"location":"reference/utils/#tablite.utils.select_type_name","title":"tablite.utils.select_type_name(dtypes: dict) ","text":"Source code in tablite/utils.py def select_type_name(dtypes: dict):\n dtypes = [t for t in dtypes.items() if t[0] != NoneType]\n\n if len(dtypes) == 0:\n return \"empty\"\n\n (best_type, _), *_ = sorted(dtypes, key=lambda t: t[1], reverse=True)\n\n return best_type.__name__\n "},{"location":"reference/utils/#tablite.utils.get_predominant_types","title":"tablite.utils.get_predominant_types(table, all_dtypes=None) ","text":"Source code in tablite/utils.py def get_predominant_types(table, all_dtypes=None):\n if all_dtypes is None:\n all_dtypes = table.types()\n\n dtypes = {\n k: select_type_name(v)\n for k, v in all_dtypes.items()\n }\n\n return dtypes\n "},{"location":"reference/utils/#tablite.utils.py_to_nim_encoding","title":"tablite.utils.py_to_nim_encoding(encoding: str) -> str ","text":"Source code in tablite/utils.py def py_to_nim_encoding(encoding: str) -> str:\n if encoding is None or encoding.lower() in [\"ascii\", \"utf8\", \"utf-8\", \"utf-8-sig\"]:\n return \"ENC_UTF8\"\n elif encoding.lower() in [\"utf16\", \"utf-16\"]:\n return \"ENC_UTF16\"\n elif encoding in Config.NIM_SUPPORTED_CONV_TYPES:\n return f\"ENC_CONV|{encoding}\"\n\n raise NotImplementedError(f\"encoding not implemented: {encoding}\")\n "},{"location":"reference/version/","title":"Version","text":""},{"location":"reference/version/#tablite.version","title":"tablite.version ","text":""},{"location":"reference/version/#tablite.version-attributes","title":"Attributes","text":""},{"location":"reference/version/#tablite.version.__version_info__","title":"tablite.version.__version_info__ = (major, minor, patch) module-attribute ","text":""},{"location":"reference/version/#tablite.version.__version__","title":"tablite.version.__version__ = '.'.join(str(i) for i in __version_info__) module-attribute ","text":""}]}
\ No newline at end of file
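Note: the search-index blob above embeds the documented tablite.utils helpers verbatim. A small usage sketch of three of them follows, with the expected results worked out from the quoted source; it assumes the helpers import from tablite.utils as documented.

from datetime import datetime, timedelta
from tablite.utils import intercept, date_range, dict_to_rows

# intercept: common values of two ranges, returned as a range.
# A = {0,2,4,6,8}, B = {4,7,10,13,16,19} -> only 4 is shared.
assert list(intercept(range(0, 10, 2), range(4, 20, 3))) == [4]

# date_range: n = (stop - start) // step values, starting at `start`.
assert date_range(datetime(2024, 1, 1), datetime(2024, 1, 4), timedelta(days=1)) == [
    datetime(2024, 1, 1), datetime(2024, 1, 2), datetime(2024, 1, 3),
]

# dict_to_rows: header row first, then one row per index.
assert dict_to_rows({"a": [1, 2], "b": [3, 4]}) == [["a", "b"], [1, 3], [2, 4]]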
diff --git a/master/sitemap.xml b/master/sitemap.xml
index 46423ca0..9b4aa0d0 100644
--- a/master/sitemap.xml
+++ b/master/sitemap.xml
@@ -2,147 +2,147 @@
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
    <url>
        <loc>https://root-11.github.io/tablite/master/</loc>
-       <lastmod>2024-04-10</lastmod>
+       <lastmod>2024-04-12</lastmod>
        <changefreq>daily</changefreq>
    </url>
    <url>
        <loc>https://root-11.github.io/tablite/master/benchmarks/</loc>
-       <lastmod>2024-04-10</lastmod>
+       <lastmod>2024-04-12</lastmod>
        <changefreq>daily</changefreq>
    </url>
    <url>
        <loc>https://root-11.github.io/tablite/master/changelog/</loc>
-       <lastmod>2024-04-10</lastmod>
+       <lastmod>2024-04-12</lastmod>
        <changefreq>daily</changefreq>
    </url>
    <url>
        <loc>https://root-11.github.io/tablite/master/tutorial/</loc>
-       <lastmod>2024-04-10</lastmod>
+       <lastmod>2024-04-12</lastmod>
        <changefreq>daily</changefreq>
    </url>
    <url>
        <loc>https://root-11.github.io/tablite/master/reference/base/</loc>
-       <lastmod>2024-04-10</lastmod>
+       <lastmod>2024-04-12</lastmod>
        <changefreq>daily</changefreq>
    </url>
    <url>
        <loc>https://root-11.github.io/tablite/master/reference/config/</loc>
-       <lastmod>2024-04-10</lastmod>
+       <lastmod>2024-04-12</lastmod>
        <changefreq>daily</changefreq>
    </url>
    <url>
        <loc>https://root-11.github.io/tablite/master/reference/core/</loc>
-       <lastmod>2024-04-10</lastmod>
+       <lastmod>2024-04-12</lastmod>
        <changefreq>daily</changefreq>
    </url>
    <url>
        <loc>https://root-11.github.io/tablite/master/reference/datasets/</loc>
-       <lastmod>2024-04-10</lastmod>
+       <lastmod>2024-04-12</lastmod>
        <changefreq>daily</changefreq>
    </url>
    <url>
        <loc>https://root-11.github.io/tablite/master/reference/datatypes/</loc>
-       <lastmod>2024-04-10</lastmod>
+       <lastmod>2024-04-12</lastmod>
        <changefreq>daily</changefreq>
    </url>
    <url>
        <loc>https://root-11.github.io/tablite/master/reference/diff/</loc>
-       <lastmod>2024-04-10</lastmod>
+       <lastmod>2024-04-12</lastmod>
        <changefreq>daily</changefreq>
    </url>
    <url>
        <loc>https://root-11.github.io/tablite/master/reference/export_utils/</loc>
-       <lastmod>2024-04-10</lastmod>
+       <lastmod>2024-04-12</lastmod>
        <changefreq>daily</changefreq>
    </url>
    <url>
        <loc>https://root-11.github.io/tablite/master/reference/file_reader_utils/</loc>
-       <lastmod>2024-04-10</lastmod>
+       <lastmod>2024-04-12</lastmod>
        <changefreq>daily</changefreq>
    </url>
    <url>
        <loc>https://root-11.github.io/tablite/master/reference/groupby_utils/</loc>
-       <lastmod>2024-04-10</lastmod>
+       <lastmod>2024-04-12</lastmod>
        <changefreq>daily</changefreq>
    </url>
    <url>
        <loc>https://root-11.github.io/tablite/master/reference/import_utils/</loc>
-       <lastmod>2024-04-10</lastmod>
+       <lastmod>2024-04-12</lastmod>
        <changefreq>daily</changefreq>
    </url>
    <url>
        <loc>https://root-11.github.io/tablite/master/reference/imputation/</loc>
-       <lastmod>2024-04-10</lastmod>
+       <lastmod>2024-04-12</lastmod>
        <changefreq>daily</changefreq>
    </url>
    <url>
        <loc>https://root-11.github.io/tablite/master/reference/joins/</loc>
-       <lastmod>2024-04-10</lastmod>
+       <lastmod>2024-04-12</lastmod>
        <changefreq>daily</changefreq>
    </url>
    <url>
        <loc>https://root-11.github.io/tablite/master/reference/lookup/</loc>
-       <lastmod>2024-04-10</lastmod>
+       <lastmod>2024-04-12</lastmod>
        <changefreq>daily</changefreq>
    </url>
    <url>
        <loc>https://root-11.github.io/tablite/master/reference/match/</loc>
-       <lastmod>2024-04-10</lastmod>
+       <lastmod>2024-04-12</lastmod>
        <changefreq>daily</changefreq>
    </url>
    <url>
        <loc>https://root-11.github.io/tablite/master/reference/merge/</loc>
-       <lastmod>2024-04-10</lastmod>
+       <lastmod>2024-04-12</lastmod>
        <changefreq>daily</changefreq>
    </url>
    <url>
        <loc>https://root-11.github.io/tablite/master/reference/mp_utils/</loc>
-       <lastmod>2024-04-10</lastmod>
+       <lastmod>2024-04-12</lastmod>
        <changefreq>daily</changefreq>
    </url>
    <url>
        <loc>https://root-11.github.io/tablite/master/reference/nimlite/</loc>
-       <lastmod>2024-04-10</lastmod>
+       <lastmod>2024-04-12</lastmod>
        <changefreq>daily</changefreq>
    </url>
    <url>
        <loc>https://root-11.github.io/tablite/master/reference/pivots/</loc>
-       <lastmod>2024-04-10</lastmod>
+       <lastmod>2024-04-12</lastmod>
        <changefreq>daily</changefreq>
    </url>
    <url>
        <loc>https://root-11.github.io/tablite/master/reference/redux/</loc>
-       <lastmod>2024-04-10</lastmod>
+       <lastmod>2024-04-12</lastmod>
        <changefreq>daily</changefreq>
    </url>
    <url>
        <loc>https://root-11.github.io/tablite/master/reference/reindex/</loc>
-       <lastmod>2024-04-10</lastmod>
+       <lastmod>2024-04-12</lastmod>
        <changefreq>daily</changefreq>
    </url>
    <url>
        <loc>https://root-11.github.io/tablite/master/reference/sort_utils/</loc>
-       <lastmod>2024-04-10</lastmod>
+       <lastmod>2024-04-12</lastmod>
        <changefreq>daily</changefreq>
    </url>
    <url>
        <loc>https://root-11.github.io/tablite/master/reference/sortation/</loc>
-       <lastmod>2024-04-10</lastmod>
+       <lastmod>2024-04-12</lastmod>
        <changefreq>daily</changefreq>
    </url>
    <url>
        <loc>https://root-11.github.io/tablite/master/reference/tools/</loc>
-       <lastmod>2024-04-10</lastmod>
+       <lastmod>2024-04-12</lastmod>
        <changefreq>daily</changefreq>
    </url>
    <url>
        <loc>https://root-11.github.io/tablite/master/reference/utils/</loc>
-       <lastmod>2024-04-10</lastmod>
+       <lastmod>2024-04-12</lastmod>
        <changefreq>daily</changefreq>
    </url>
    <url>
        <loc>https://root-11.github.io/tablite/master/reference/version/</loc>
-       <lastmod>2024-04-10</lastmod>
+       <lastmod>2024-04-12</lastmod>
        <changefreq>daily</changefreq>
    </url>
</urlset>
\ No newline at end of file
diff --git a/master/sitemap.xml.gz b/master/sitemap.xml.gz
index d14b302b..e6fc196c 100644
Binary files a/master/sitemap.xml.gz and b/master/sitemap.xml.gz differ
diff --git a/master/tablite/redux.py b/master/tablite/redux.py
index b922367e..826e4993 100644
--- a/master/tablite/redux.py
+++ b/master/tablite/redux.py
@@ -142,7 +142,7 @@ def _compress_one(T, mask):
    return new


-def _compress_both(T, mask, pbar: _tqdm):
+def compress_both(T, mask, pbar: _tqdm):
    # NOTE FOR DEVELOPERS:
    # np.compress is so fast that the overhead of multiprocessing doesn't pay off.
    cls = type(T)
@@ -161,30 +161,12 @@ def _compress_both(T, mask, pbar: _tqdm):
            data = T[name][start:end]
            true_col.extend(np.compress(mask[start:end], data))
            false_col.extend(np.compress(np.invert(mask)[start:end], data))
-            pbar.update(pbar_step)
+            if pbar is not None:
+                pbar.update(pbar_step)
    return true, false
-def _filter_using_list_of_dicts(T, expressions, filter_type, pbar: _tqdm):
-    """
-    enables filtering across columns for multiple criteria.
-
-    expressions:
-
-    str: Expression that can be compiled and executed row by row.
-        exampLe: "all((A==B and C!=4 and 200<D))"
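Taken together, the redux.py hunks promote compress_both to the public module surface and make the progress bar optional. A minimal sketch of the resulting call pattern follows; the table contents are illustrative, and it assumes pbar=None is now accepted throughout the function, not just at the update() call guarded above.

import numpy as np
from tablite import Table
from tablite.redux import compress_both

t = Table()
t["A"] = [1, 2, 3, 4]            # dict-of-lists style column assignment
t["B"] = [10, 20, 30, 40]

mask = np.array([True, False, True, False])  # one boolean per row
trues, falses = compress_both(t, mask, pbar=None)
# trues holds the rows where mask is True; falses holds the complement.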