296
297
298
299
@@ -1826,7 +2122,15 @@
337
338
339
-340 | def filter_non_primitive(T, expressions, filter_type="all", tqdm=_tqdm):
+340
+341
+342
+343
+344
+345
+346
+347
+348
| def filter_non_primitive(T, expressions, filter_type="all", tqdm=_tqdm):
"""
OBSOLETE
filters table
@@ -1875,7 +2179,7 @@
else:
raise TypeError
# create new tables
- res = _compress_both(T, mask, pbar=pbar)
+ res = compress_both(T, mask, pbar=pbar)
pbar.update(pbar.total - pbar.n)
return res
@@ -2030,15 +2334,7 @@
Source code in tablite/redux.py
- 342
-343
-344
-345
-346
-347
-348
-349
-350
+ 350
351
352
353
@@ -2080,7 +2376,15 @@
389
390
391
-392 | def filter(T, expressions, filter_type="all", tqdm=_tqdm):
+392
+393
+394
+395
+396
+397
+398
+399
+400
| def filter(T, expressions, filter_type="all", tqdm=_tqdm):
"""filters table
Note: At the moment only tablite primitive types are supported
@@ -2122,7 +2426,7 @@
# TODO: make parser for expressions and use the nim implement
mask = _filter_using_expression(T, expressions)
pbar.update(10)
- res = _compress_both(T, mask, pbar=pbar)
+ res = compress_both(T, mask, pbar=pbar)
pbar.update(pbar.total - pbar.n)
elif isinstance(expressions, list):
return _filter_using_list_of_dicts_native(T, expressions, filter_type, tqdm)
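Note: the patched filter above accepts two input styles, an expression string (routed through _filter_using_expression) or a list of criteria dicts (routed through _filter_using_list_of_dicts_native). A hedged sketch of both call styles, assuming the two-table (true, false) return shown in the feature overview below; the table contents are illustrative only:

    from tablite import Table

    t = Table({'A': [1, 2, 3, 4], 'B': [10, 20, 30, 40]})

    # expression string: any expression python's compiler accepts (changelog 2022.10.11)
    true, false = t.filter("A >= 3")

    # list of criteria dicts, combined with filter_type 'all' or 'any'
    true, false = t.filter(
        [{"column1": "A", "criteria": ">=", "value2": 3}],
        filter_type="all",
    )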
diff --git a/master/search/search_index.json b/master/search/search_index.json
index 64828110..b3d33d55 100644
--- a/master/search/search_index.json
+++ b/master/search/search_index.json
@@ -1 +1 @@
-{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"Tablite","text":""},{"location":"#contents","title":"Contents","text":" - introduction
- installation
- feature overview
- api
- tutorial
- latest updates
- credits
"},{"location":"#introduction","title":"Introduction","text":"Tablite seeks to be the go-to library for manipulating tabular data with an api that is as close in syntax to pure python as possible. "},{"location":"#even-smaller-memory-footprint","title":"Even smaller memory footprint","text":"Tablite uses numpys fileformat as a backend with strong abstraction, so that copy, append & repetition of data is handled in pages. This is imperative for incremental data processing. Tablite tests for memory footprint. One test compares the memory footprint of 10,000,000 integers where tablite will use < 1 Mb RAM in contrast to python which will require around 133.7 Mb of RAM (1M lists with 10 integers). Tablite also tests to assure that working with 1Tb of data is tolerable. Tablite achieves this minimal memory footprint by using a temporary storage set in config.Config.workdir as tempfile.gettempdir()/tablite-tmp . If your OS (windows/linux/mac) sits on a SSD this will benefit from high IOPS and permit slices of 9,000,000,000 rows in less than a second. "},{"location":"#multiprocessing-enabled-by-default","title":"Multiprocessing enabled by default","text":"Tablite uses numpy whereever possible and applies multiprocessing for bypassing the GIL on all major operations. CSV import is performed in C through using nim s compiler and is as fast the hardware allows. "},{"location":"#all-algorithms-have-been-reworked-to-respect-memory-limits","title":"All algorithms have been reworked to respect memory limits","text":"Tablite respects the limits of free memory by tagging the free memory and defining task size before each memory intensive task is initiated (join, groupby, data import, etc). If you still run out of memory you may try to reduce the config.Config.PAGE_SIZE and rerun your program. "},{"location":"#100-support-for-all-python-datatypes","title":"100% support for all python datatypes","text":"Tablite wants to make it easy for you to work with data. tablite.Table's behave like a dict with lists: my_table[column name] = [... data ...] . Tablite uses datatype mapping to native numpy types where possible and uses type mapping for non-native types such as timedelta, None, date, time\u2026 e.g. what you put in, is what you get out. This is inspired by bank python. "},{"location":"#light-weight","title":"Light weight","text":"Tablite is ~200 kB. "},{"location":"#helpful","title":"Helpful","text":"Tablite wants you to be productive, so a number of helpers are available. Table.import_file to import csv*, tsv, txt, xls, xlsx, xlsm, ods, zip and logs. There is automatic type detection (see tutorial.ipynb ) - To peek into any supported file use
get_headers which shows the first 10 rows. - Use
mytable.rows and mytable.columns to iterate over rows or columns. - Create multi-key
.index for quick lookups. - Perform multi-key
.sort , - Filter using
.any and .all to select specific rows. - Use multi-key
.lookup and .join to find data across tables. - Perform
.groupby and reorganise data as a .pivot table with max, min, sum, first, last, count, unique, average, st.deviation, median and mode - Append / concatenate tables with
+= which automatically sorts out the columns - even if they're not in perfect order. - Should your tables be similar but not identical you can use
.stack to \"stack\" tables on top of each other If you're still missing something add it to the wishlist "},{"location":"#installation","title":"Installation","text":"Get it from pypi: Install: pip install tablite Usage: >>> from tablite import Table "},{"location":"#build-test","title":"Build & test","text":"install nim >= 2.0.0 run: chmod +x ./build_nim.sh run: ./build_nim.sh Should the default nim not be your desired taste, please use nims environment manager (atlas ) and run source nim-2.0.0/activate.sh on UNIX or nim-2.0.0/activate.bat on windows. install python >= 3.8\npython -m venv /your/venv/dir\nactivate /your/venv/dir\npip install -r requirements.txt\npip install -r requirements_for_testing.py\npytest ./tests\n "},{"location":"#feature-overview","title":"Feature overview","text":"want to... this way... loop over rows [ row for row in table.rows ] loop over columns [ table[col_name] for col_name in table.columns ] slice myslice = table['A', 'B', slice(0,None,15)] get column by name my_table['A'] get row by index my_table[9_000_000_001] value update mytable['A'][2] = new value update w. list comprehension mytable['A'] = [ x*x for x in mytable['A'] if x % 2 != 0 ] join a_join = numbers.join(letters, left_keys=['colour'], right_keys=['color'], left_columns=['number'], right_columns=['letter'], kind='left') lookup travel_plan = friends.lookup(bustable, (DataTypes.time(21, 10), \"<=\", 'time'), ('stop', \"==\", 'stop')) groupby group_by = table.groupby(keys=['C', 'B'], functions=[('A', gb.count)]) pivot table my_pivot = t.pivot(rows=['C'], columns=['A'], functions=[('B', gb.sum), ('B', gb.count)], values_as_rows=False) index indices = old_table.index(*old_table.columns) sort lookup1_sorted = lookup_1.sort(**{'time': True, 'name':False, \"sort_mode\":'unix'}) filter true, false = unfiltered.filter( [{\"column1\": 'a', \"criteria\":\">=\", 'value2':3}, ... more criteria ... ], filter_type='all' ) find any any_even_rows = mytable.any('A': lambda x : x%2==0, 'B': lambda x > 0) find all all_even_rows = mytable.all('A': lambda x : x%2==0, 'B': lambda x > 0) to json json_str = my_table.to_json() from json Table.from_json(json_str) "},{"location":"#api","title":"API","text":"To view the detailed API see api "},{"location":"#tutorial","title":"Tutorial","text":"To learn more see the tutorial.ipynb (Jupyter notebook) "},{"location":"#latest-updates","title":"Latest updates","text":"See changelog.md "},{"location":"#credits","title":"Credits","text":" - Eugene Antonov - the api documentation.
- Audrius Kulikajevas - Edge case testing / various bugs, Jupyter notebook integration.
- Ovidijus Grigas - various bugs, documentation.
- Martynas Kaunas - GroupBy functionality.
- Sergej Sinkarenko - various bugs.
- Lori Cooper - spell checking.
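The feature overview above condenses to a few lines of runnable code. A minimal sketch, with column names and data that are illustrative only (the .any predicate style follows the feature overview):

    from tablite import Table

    t = Table({'A': [1, 2, 3, 4], 'B': ['w', 'x', 'y', 'z']})
    rows = [row for row in t.rows]                 # loop over rows
    cols = [t[name] for name in t.columns]         # loop over columns
    t['C'] = [x * x for x in t['A']]               # create/update a column
    evens = t.any(**{'A': lambda x: x % 2 == 0})   # select rows by predicate
    t2 = t + t                                     # append tables with +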
"},{"location":"benchmarks/","title":"Benchmarks","text":"In\u00a0[2]: Copied! import psutil, os, gc, shutil, tempfile\nfrom pathlib import Path\nfrom time import perf_counter, time\nfrom tablite import Table\nfrom tablite.datasets import synthetic_order_data\nfrom tablite.config import Config\n\nConfig.TQDM_DISABLE = True\n import psutil, os, gc, shutil, tempfile from pathlib import Path from time import perf_counter, time from tablite import Table from tablite.datasets import synthetic_order_data from tablite.config import Config Config.TQDM_DISABLE = True In\u00a0[3]: Copied! process = psutil.Process(os.getpid())\n\ndef make_tables(sizes=[1,2,5,10,20,50]):\n # The last tables are too big for RAM (~24Gb), so I create subtables of 1M rows and append them.\n t = synthetic_order_data(Config.PAGE_SIZE)\n real, flat = t.nbytes()\n print(f\"Table {len(t):,} rows is {real/1e6:,.0f} Mb on disk\")\n\n tables = [t] # 1M rows.\n\n last = 1\n t2 = t.copy()\n for i in sizes[1:]:\n t2 = t2.copy()\n for _ in range(i-last):\n t2 += synthetic_order_data(Config.PAGE_SIZE) # these are all unique\n last = i\n real, flat = t2.nbytes()\n tables.append(t2)\n print(f\"Table {len(t2):,} rows is {real/1e6:,.0f} Mb on disk\")\n return tables\n\ntables = make_tables()\n process = psutil.Process(os.getpid()) def make_tables(sizes=[1,2,5,10,20,50]): # The last tables are too big for RAM (~24Gb), so I create subtables of 1M rows and append them. t = synthetic_order_data(Config.PAGE_SIZE) real, flat = t.nbytes() print(f\"Table {len(t):,} rows is {real/1e6:,.0f} Mb on disk\") tables = [t] # 1M rows. last = 1 t2 = t.copy() for i in sizes[1:]: t2 = t2.copy() for _ in range(i-last): t2 += synthetic_order_data(Config.PAGE_SIZE) # these are all unique last = i real, flat = t2.nbytes() tables.append(t2) print(f\"Table {len(t2):,} rows is {real/1e6:,.0f} Mb on disk\") return tables tables = make_tables() Table 1,000,000 rows is 256 Mb on disk\nTable 2,000,000 rows is 512 Mb on disk\nTable 5,000,000 rows is 1,280 Mb on disk\nTable 10,000,000 rows is 2,560 Mb on disk\nTable 20,000,000 rows is 5,120 Mb on disk\nTable 50,000,000 rows is 12,800 Mb on disk\n The values in the tables above are all unique! In\u00a0[4]: Copied! 
tables[-1]\n Out[4]: [rendered preview of the 50,000,000-row table elided: 12 columns (#, 1..11) of synthetic order data - ids, order dates, quantities, product codes and floats] In\u00a0[5]: def save_load_benchmarks(tables):\n tmp = Path(tempfile.gettempdir()) / \"junk\"\n tmp.mkdir(exist_ok=True)\n\n results = Table()\n results.add_columns('rows', 'save (sec)', 'load (sec)')\n for t in tables:\n fn = tmp / f'{len(t)}.tpz'\n start = perf_counter()\n t.save(fn)\n end = perf_counter()\n save = round(end-start,3)\n assert fn.exists()\n\n start = perf_counter()\n t2 = Table.load(fn)\n end = perf_counter()\n load = round(end-start,3)\n print(f\"saving {len(t):,} rows ({fn.stat().st_size/1e6:,.0f} Mb) took {save:,.3f} seconds. loading took {load:,.3f} seconds\")\n del t2\n fn.unlink()\n results.add_rows(len(t), save, load)\n\n r = results\n r['save r/sec'] = [int(a/b) if b!=0 else \"nil\" for a,b in zip(r['rows'], r['save (sec)'])]\n r['load r/sec'] = [int(a/b) if b!=0 else \"nil\" for a,b in zip(r['rows'], r['load (sec)'])]\n\n return results\n In\u00a0[6]: slb = save_load_benchmarks(tables)\n saving 1,000,000 rows (49 Mb) took 2.148 seconds. loading took 0.922 seconds\nsaving 2,000,000 rows (98 Mb) took 4.267 seconds. loading took 1.820 seconds\nsaving 5,000,000 rows (246 Mb) took 10.618 seconds. 
loading took 4.482 seconds\nsaving 10,000,000 rows (492 Mb) took 21.291 seconds. loading took 8.944 seconds\nsaving 20,000,000 rows (984 Mb) took 42.603 seconds. loading took 17.821 seconds\nsaving 50,000,000 rows (2,461 Mb) took 106.644 seconds. loading took 44.600 seconds\n In\u00a0[7]: slb\n Out[7]: # | rows | save (sec) | load (sec) | save r/sec | load r/sec 0 | 1000000 | 2.148 | 0.922 | 465549 | 1084598 1 | 2000000 | 4.267 | 1.82 | 468713 | 1098901 2 | 5000000 | 10.618 | 4.482 | 470898 | 1115573 3 | 10000000 | 21.291 | 8.944 | 469682 | 1118067 4 | 20000000 | 42.603 | 17.821 | 469450 | 1122271 5 | 50000000 | 106.644 | 44.6 | 468849 | 1121076 With various compression options In\u00a0[8]: def save_compression_benchmarks(t):\n tmp = Path(tempfile.gettempdir()) / \"junk\"\n tmp.mkdir(exist_ok=True)\n\n import zipfile # https://docs.python.org/3/library/zipfile.html#zipfile.ZipFile\n methods = [(None, zipfile.ZIP_STORED, \"zip stored\"), (None, zipfile.ZIP_LZMA, \"zip lzma\")]\n methods += [(i, zipfile.ZIP_DEFLATED, \"zip deflated\") for i in range(0,10)]\n methods += [(i, zipfile.ZIP_BZIP2, \"zip bzip2\") for i in range(1,10)]\n\n results = Table()\n results.add_columns('file size (Mb)', 'method', 'write (sec)', 'read (sec)')\n for level, method, name in methods:\n fn = tmp / f'{len(t)}.tpz'\n start = perf_counter()\n t.save(fn, compression_method=method, compression_level=level)\n end = perf_counter()\n write = round(end-start,3)\n assert fn.exists()\n size = int(fn.stat().st_size/1e6)\n # print(f\"{name}(level={level}): {len(t):,} rows ({size} Mb) took {write:,.3f} seconds to save\", end='')\n\n start = perf_counter()\n t2 = Table.load(fn)\n end = perf_counter()\n read = round(end-start,3)\n # print(f\" and {end-start:,.3} seconds to load\")\n print(\".\", end='')\n\n del t2\n fn.unlink()\n results.add_rows(size, f\"{name}(level={level})\", write, read)\n\n r = results\n r.sort({'write (sec)':True})\n r['write (rps)'] = [int(1_000_000/b) for b in r['write (sec)']]\n r['read (rps)'] = [int(1_000_000/b) for b in r['read (sec)']]\n return results\n In\u00a0[9]: scb = save_compression_benchmarks(tables[0])\n ..................... 
creating sort index: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 1/1 [00:00<00:00, 268.92it/s]\n In\u00a0[10]: scb[0:20]\n Out[10]: # | file size (Mb) | method | write (sec) | read (sec) | write (rps) | read (rps) 0 | 256 | zip stored(level=None) | 0.396 | 0.475 | 2525252 | 2105263 1 | 29 | zip lzma(level=None) | 95.137 | 2.228 | 10511 | 448833 2 | 256 | zip deflated(level=0) | 0.535 | 0.595 | 1869158 | 1680672 3 | 49 | zip deflated(level=1) | 2.15 | 0.922 | 465116 | 1084598 4 | 47 | zip deflated(level=2) | 2.264 | 0.912 | 441696 | 1096491 5 | 43 | zip deflated(level=3) | 3.049 | 0.83 | 327976 | 1204819 6 | 44 | zip deflated(level=4) | 2.92 | 0.862 | 342465 | 1160092 7 | 42 | zip deflated(level=5) | 4.034 | 0.869 | 247892 | 1150747 8 | 40 | zip deflated(level=6) | 8.558 | 0.8 | 116849 | 1250000 9 | 39 | zip deflated(level=7) | 13.695 | 0.778 | 73019 | 1285347 10 | 38 | zip deflated(level=8) | 56.972 | 0.792 | 17552 | 1262626 11 | 38 | zip deflated(level=9) | 122.623 | 0.791 | 8155 | 1264222 12 | 29 | zip bzip2(level=1) | 15.121 | 4.065 | 66133 | 246002 13 | 29 | zip bzip2(level=2) | 16.047 | 4.214 | 62316 | 237304 14 | 29 | zip bzip2(level=3) | 16.858 | 4.409 | 59319 | 226808 15 | 29 | zip bzip2(level=4) | 17.648 | 5.141 | 56663 | 194514 16 | 29 | zip bzip2(level=5) | 18.674 | 6.009 | 53550 | 166417 17 | 29 | zip bzip2(level=6) | 19.405 | 6.628 | 51533 | 150875 18 | 29 | zip bzip2(level=7) | 19.954 | 6.714 | 50115 | 148942 19 | 29 | zip bzip2(level=8) | 20.595 | 6.961 | 48555 | 143657 Conclusions - Fastest: zip stored with no compression, at ~2.5M rows/sec write and ~2.1M rows/sec read.
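The compression keywords exercised above are ordinary zipfile constants, so picking a tradeoff is a one-liner. A sketch using deflate level 1, which in the run above kept the file at ~19% of the stored size and appears to match the default settings used in the save/load run at the top of this page (identical timings):

    import zipfile
    from pathlib import Path
    from tablite import Table

    t = Table({'A': list(range(1_000_000))})
    fn = Path('data.tpz')
    t.save(fn, compression_method=zipfile.ZIP_DEFLATED, compression_level=1)
    t2 = Table.load(fn)
    assert t == t2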
In\u00a0[11]: def to_sql_benchmark(t, rows=1_000_000):\n t2 = t[:rows]\n write_start = time()\n _ = t2.to_sql(name='1')\n write_end = time()\n write = round(write_end-write_start,3)\n return ( t.to_sql.__name__, write, 0, len(t2), \"\" , \"\" )\n In\u00a0[12]: def to_json_benchmark(t, rows=1_000_000):\n t2 = t[:rows]\n\n tmp = Path(tempfile.gettempdir()) / \"junk\"\n tmp.mkdir(exist_ok=True)\n path = tmp / \"1.json\"\n\n write_start = time()\n bytestr = t2.to_json()\n with path.open('w') as fo:\n fo.write(bytestr)\n write_end = time()\n write = round(write_end-write_start,3)\n\n read_start = time()\n with path.open('r') as fi:\n _ = Table.from_json(fi.read()) # <-- JSON\n read_end = time()\n read = round(read_end-read_start,3)\n\n return ( t.to_json.__name__, write, read, len(t2), int(path.stat().st_size/1e6), \"\" )\n In\u00a0[13]: def f(t, args):\n rows, c1, c1_kw, c2, c2_kw = args\n t2 = t[:rows]\n\n call = getattr(t2, c1)\n assert callable(call)\n\n write_start = time()\n call(**c1_kw)\n write_end = time()\n write = round(write_end-write_start,3)\n\n for _ in range(10):\n gc.collect()\n\n read_start = time()\n if callable(c2):\n c2(**c2_kw)\n read_end = time()\n read = round(read_end-read_start,3)\n\n fn = c2_kw['path']\n assert fn.exists()\n fs = int(fn.stat().st_size/1e6)\n config = {k:v for k,v in c2_kw.items() if k!= 'path'}\n\n return ( c1, write, read, len(t2), fs , str(config))\n In\u00a0[14]: 
def import_export_benchmarks(tables):\n Config.PROCESSING_MODE = Config.FALSE\n\n t = sorted(tables, key=lambda x: len(x), reverse=True)[0]\n\n tmp = Path(tempfile.gettempdir()) / \"junk\"\n tmp.mkdir(exist_ok=True)\n\n args = [\n ( 100_000, \"to_xlsx\", {'path': tmp/'1.xlsx'}, Table.from_file, {\"path\":tmp/'1.xlsx', \"sheet\":\"pyexcel_sheet1\"}),\n ( 50_000, \"to_ods\", {'path': tmp/'1.ods'}, Table.from_file, {\"path\":tmp/'1.ods', \"sheet\":\"pyexcel_sheet1\"} ), # 50k rows, otherwise MemoryError.\n ( 1_000_000, \"to_csv\", {'path': tmp/'1.csv'}, Table.from_file, {\"path\":tmp/'1.csv'} ),\n ( 1_000_000, \"to_csv\", {'path': tmp/'1.csv'}, Table.from_file, {\"path\":tmp/'1.csv', \"guess_datatypes\":False}),\n (10_000_000, \"to_csv\", {'path': tmp/'1.csv'}, Table.from_file, {\"path\":tmp/'1.csv', \"guess_datatypes\":False}),\n ( 1_000_000, \"to_tsv\", {'path': tmp/'1.tsv'}, Table.from_file, {\"path\":tmp/'1.tsv'} ),\n ( 1_000_000, \"to_text\", {'path': tmp/'1.txt'}, Table.from_file, {\"path\":tmp/'1.txt'} ),\n ( 1_000_000, \"to_html\", {'path': tmp/'1.html'}, Table.from_file, {\"path\":tmp/'1.html'} ),\n ( 1_000_000, \"to_hdf5\", {'path': tmp/'1.hdf5'}, Table.from_file, {\"path\":tmp/'1.hdf5'} )\n ]\n\n results = Table()\n results.add_columns('method', 'write (s)', 'read (s)', 'rows', 'size (Mb)', 'config')\n\n results.add_rows( to_sql_benchmark(t) )\n results.add_rows( to_json_benchmark(t) )\n\n for arg in args:\n if len(t)<arg[0]:\n continue\n print(\".\", end='')\n try:\n results.add_rows( f(t, arg) )\n except MemoryError:\n results.add_rows( arg[1], \"Memory Error\", \"NIL\", arg[0], \"NIL\", \"N/A\")\n\n r = results\n r['read r/sec'] = [int(a/b) if b!=0 else \"nil\" for a,b in zip(r['rows'], r['read (s)']) ]\n r['write r/sec'] = [int(a/b) if b!=0 else \"nil\" for a,b in zip(r['rows'], r['write (s)'])]\n\n shutil.rmtree(tmp)\n return results\n In\u00a0[15]: ieb = import_export_benchmarks(tables)\n .........writing 12,000,000 records to /tmp/junk/1.hdf5... done\n
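Each of the handlers above can also be called directly. A sketch of a single round-trip outside the harness, reusing the keyword signatures from the args list above; file names are illustrative:

    import tempfile
    from pathlib import Path
    from tablite import Table

    tmp = Path(tempfile.gettempdir()) / "junk"
    tmp.mkdir(exist_ok=True)

    t = Table({'A': [1, 2, 3], 'B': ['x', 'y', 'z']})
    t.to_csv(path=tmp / 'demo.csv')
    t2 = Table.from_file(path=tmp / 'demo.csv', guess_datatypes=False)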
In\u00a0[16]: ieb\n Out[16]: # | method | write (s) | read (s) | rows | size (Mb) | config | read r/sec | write r/sec 0 | to_sql | 12.345 | 0 | 1000000 | | | nil | 81004 1 | to_json | 10.814 | 4.406 | 1000000 | 142 | | 226963 | 92472 2 | to_xlsx | 10.569 | 21.572 | 100000 | 9 | {'sheet': 'pyexcel_sheet1'} | 4635 | 9461 3 | to_ods | 29.175 | 29.487 | 50000 | 3 | {'sheet': 'pyexcel_sheet1'} | 1695 | 1713 4 | to_csv | 14.315 | 15.731 | 1000000 | 108 | {} | 63568 | 69856 5 | to_csv | 14.438 | 8.169 | 1000000 | 108 | {'guess_datatypes': False} | 122414 | 69261 6 | to_csv | 140.645 | 99.45 | 10000000 | 1080 | {'guess_datatypes': False} | 100553 | 71100 7 | to_tsv | 13.834 | 15.763 | 1000000 | 108 | {} | 63439 | 72285 8 | to_text | 13.937 | 15.682 | 1000000 | 108 | {} | 63767 | 71751 9 | to_html | 12.578 | 0.53 | 1000000 | 228 | {} | 1886792 | 79503 10 | to_hdf5 | 5.01 | 12.345 | 1000000 | 316 | {} | 81004 | 199600 Conclusions Best: - to/from JSON wins with ~230k rps read
- to/from CSV/TSV/TEXT comes 2nd with config
guess_datatypes=False with ~ 100k rps Worst: - to/from ods burst the memory footprint and hence had to be reduced to 50k rows. It also had the slowest read rate with ~1,700 rps.
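The JSON pair that tops the author's list needs no file at all: to_json returns a string and from_json rebuilds the table, exactly as in the to_json_benchmark above. A minimal round-trip:

    from tablite import Table

    t = Table({'A': [1, 2, 3], 'B': ['x', 'y', 'z']})
    json_str = t.to_json()
    t2 = Table.from_json(json_str)
    assert t == t2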
In\u00a0[17]: def contains_benchmark(table):\n results = Table()\n results.add_columns( \"column\", \"time (s)\" )\n for name,col in table.columns.items():\n n = len(col)\n start,stop,step = int(n*0.02), int(n*0.98), int(n/100)\n selection = col[start:stop:step]\n total_time = 0.0\n for v in selection:\n start_time = perf_counter()\n v in col # <--- test!\n end_time = perf_counter()\n total_time += (end_time - start_time)\n avg_time = total_time / len(selection)\n results.add_rows( name, round(avg_time,3) )\n\n return results\n In\u00a0[18]: has_it = contains_benchmark(tables[-1])\nhas_it\n Out[18]: # | column | time (s) 0 | # | 0.001 1 | 1 | 0.043 2 | 2 | 0.032 3 | 3 | 0.001 4 | 4 | 0.001 5 | 5 | 0.001 6 | 6 | 0.006 7 | 7 | 0.003 8 | 8 | 0.006 9 | 9 | 0.007 10 | 10 | 0.043 11 | 11 | 0.655 In\u00a0[19]: def slicing_benchmark(table):\n n = len(table)\n start,stop,step = int(0.02*n), int(0.98*n), int(n / 20) # from 2% to 98% in 20 large steps\n start_time = perf_counter()\n snip = table[start:stop:step]\n end_time = perf_counter()\n print(f\"reading {len(table):,} rows to find {len(snip):,} rows took {end_time-start_time:.3f} sec\")\n return snip\n In\u00a0[20]: slice_it = slicing_benchmark(tables[-1])\n reading 50,000,000 rows to find 20 rows took 1.435 sec\n In\u00a0[22]: def column_selection_benchmark(tables):\n results = Table()\n results.add_columns( 'rows')\n results.add_columns(*[f\"n cols={i}\" for i,_ in enumerate(tables[0].columns,start=1)])\n\n for table in tables:\n rr = [len(table)]\n for ix, name in enumerate(table.columns):\n cols = list(table.columns)[:ix+1]\n start_time = perf_counter()\n table[cols]\n end_time = perf_counter()\n rr.append(f\"{end_time-start_time:.5f}\")\n results.add_rows( rr )\n return results\n
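The slicing benchmark above leans on the same getitem syntax as the feature overview. A sketch of the three common forms, with illustrative data:

    from tablite import Table

    t = Table({'A': list(range(100)), 'B': list(range(100))})
    snip = t[2:98:5]                        # row slice across all columns
    col = t['A']                            # a single column
    sub = t['A', 'B', slice(0, None, 15)]   # named columns plus a row slice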
In\u00a0[23]: csb = column_selection_benchmark(tables)\nprint(\"times below are in seconds\")\ncsb\n times below are in seconds\n Out[23]: # | rows | n cols=1..12 0 | 1000000 | 0.00001 0.00006 0.00004 0.00004 0.00004 0.00004 0.00004 0.00004 0.00004 0.00004 0.00004 0.00004 1 | 2000000 | 0.00001 0.00008 0.00003 0.00003 0.00003 0.00003 0.00003 0.00003 0.00003 0.00003 0.00004 0.00004 2 | 5000000 | 0.00001 0.00005 0.00004 0.00004 0.00004 0.00004 0.00004 0.00004 0.00004 0.00004 0.00004 0.00004 3 | 10000000 | 0.00002 0.00005 0.00004 0.00004 0.00004 0.00004 0.00007 0.00005 0.00005 0.00005 0.00005 0.00005 4 | 20000000 | 0.00003 0.00006 0.00005 0.00005 0.00005 0.00005 0.00006 0.00006 0.00006 0.00006 0.00006 0.00006 5 | 50000000 | 0.00009 0.00011 0.00010 0.00009 0.00009 0.00009 0.00009 0.00009 0.00009 0.00009 0.00010 0.00009 In\u00a0[33]: def iterrows_benchmark(table):\n results = Table()\n results.add_columns( 'n columns', 'time (s)')\n\n columns = ['1']\n for column in list(table.columns):\n columns.append(column)\n snip = table[columns, slice(500_000,1_500_000)]\n start_time = perf_counter()\n counts = 0\n for row in snip.rows:\n counts += 1\n end_time = perf_counter()\n results.add_rows( len(columns), round(end_time-start_time,3))\n\n return results\n In\u00a0[34]: iterb = iterrows_benchmark(tables[-1])\niterb\n Out[34]: # | n columns | time (s) 0 | 2 | 9.951 1 | 3 | 9.816 2 | 4 | 9.859 3 | 5 | 9.93 4 | 6 | 9.985 5 | 7 | 9.942 6 | 8 | 9.958 7 | 9 | 9.867 8 | 10 | 9.96 9 | 11 | 9.932 10 | 12 | 9.83 11 | 13 | 9.861 In\u00a0[35]: import matplotlib.pyplot as plt\nplt.plot(iterb['n columns'], iterb['time (s)'])\nplt.show()\n In\u00a0[28]: tables[-1].types()\n Out[28]: {'#': {int: 50000000},\n '1': {int: 50000000},\n '2': {str: 50000000},\n '3': {int: 50000000},\n '4': {int: 50000000},\n '5': {int: 50000000},\n '6': {str: 50000000},\n '7': {str: 50000000},\n '8': {str: 50000000},\n '9': {str: 50000000},\n '10': {float: 50000000},\n '11': {str: 50000000}} In\u00a0[29]: def dtypes_benchmark(tables):\n dtypes_results = Table()\n dtypes_results.add_columns(\"rows\", \"time (s)\")\n\n for table in tables:\n start_time = perf_counter()\n dt = table.types()\n end_time = perf_counter()\n assert isinstance(dt, dict) and len(dt) != 0\n dtypes_results.add_rows( len(table), round(end_time-start_time, 3) )\n\n return dtypes_results\n In\u00a0[30]: dtype_b = dtypes_benchmark(tables)\ndtype_b\n Out[30]: # | rows | time (s) 0 | 1000000 | 0.0 1 | 2000000 | 0.0 2 | 5000000 | 0.0 3 | 10000000 | 0.0 4 | 20000000 | 0.0 5 | 50000000 | 0.001
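Table.types(), timed above, returns a nested dict with one entry per column mapping each python type to its row count, which is what makes the near-constant-time schema check possible. A sketch, with illustrative counts:

    from tablite import Table

    t = Table({'A': [1, 2, None], 'B': ['x', 'y', 'z']})
    for name, type_counts in t.types().items():
        print(name, type_counts)   # e.g. A {<class 'int'>: 2, <class 'NoneType'>: 1}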
In\u00a0[31]: def any_benchmark(tables):\n results = Table()\n results.add_columns(\"rows\", *list(tables[0].columns))\n\n for table in tables:\n tmp = [len(table)]\n for column in list(table.columns):\n v = table[column][0]\n start_time = perf_counter()\n _ = table.any(**{column: v})\n end_time = perf_counter()\n tmp.append(round(end_time-start_time,3))\n\n results.add_rows( tmp )\n return results\n In\u00a0[32]: anyb = any_benchmark(tables)\nanyb\n Out[32]: [flattened 6x13 timing matrix elided: any() runtimes in seconds per column (#, 1..11) for tables of 1M..50M rows; roughly 0.13-0.29 s at 1M rows, growing to roughly 6.3-14.5 s at 50M rows] In\u00a0[36]: def all_benchmark(tables):\n results = Table()\n results.add_columns(\"rows\", *list(tables[0].columns))\n\n for table in tables:\n tmp = [len(table)]\n for column in list(table.columns):\n v = table[column][0]\n start_time = perf_counter()\n _ = table.all(**{column: v})\n end_time = perf_counter()\n tmp.append(round(end_time-start_time,3))\n\n results.add_rows( tmp )\n return results\n In\u00a0[37]: allb = all_benchmark(tables)\nallb\n Out[37]: [flattened 6x13 timing matrix elided: all() runtimes closely tracking any() above; roughly 0.12-0.26 s at 1M rows and 5.8-12.9 s at 50M rows] In\u00a0[38]: def unique_benchmark(tables):\n results = Table()\n results.add_columns(\"rows\", *list(tables[0].columns))\n\n for table in tables:\n length = len(table)\n\n tmp = [len(table)]\n for column in list(table.columns):\n start_time = perf_counter()\n try:\n L = table[column].unique()\n dt = perf_counter() - start_time\n except MemoryError:\n dt = -1\n tmp.append(round(dt,3))\n assert 0 < len(L) <= length\n\n results.add_rows( tmp )\n return results\n
In\u00a0[39]: ubm = unique_benchmark(tables)\nubm\n Out[39]: [flattened 6x13 timing matrix elided: unique() runtimes in seconds per column; 0.016-0.447 s at 1M rows, 0.764-30.511 s at 50M rows, slowest on the high-cardinality string column 11] In\u00a0[40]: def index_benchmark(tables):\n results = Table()\n results.add_columns(\"rows\", *list(tables[0].columns))\n\n for table in tables:\n\n tmp = [len(table)]\n for column in list(table.columns):\n start_time = perf_counter()\n try:\n _ = table.index(column)\n dt = perf_counter() - start_time\n except MemoryError:\n dt = -1\n tmp.append(round(dt,3))\n\n results.add_rows( tmp )\n return results\n In\u00a0[41]: ibm = index_benchmark(tables)\nibm\n Out[41]: [flattened 6x13 timing matrix elided: single-column index build times in seconds; 1.05-2.33 s at 1M rows rising to 54-110 s at 50M rows] Multi-column index next: In\u00a0[42]: def multi_column_index_benchmark(tables):\n\n selection = [\"4\", \"7\", \"8\", \"9\"]\n results = Table()\n results.add_columns(\"rows\", *range(1,len(selection)+1))\n\n for table in tables:\n\n tmp = [len(table)]\n for index in range(1,5):\n start_time = perf_counter()\n try:\n _ = table.index(*selection[:index])\n dt = perf_counter() - start_time\n except MemoryError:\n dt = -1\n tmp.append(round(dt,3))\n print('.', end='')\n\n results.add_rows( tmp )\n return results\n In\u00a0[43]: mcib = multi_column_index_benchmark(tables)\nmcib\n ........................ Out[43]: # | rows | 1 | 2 | 3 | 4 0 | 1000000 | 1.058 | 2.133 | 3.215 | 4.052 1 | 2000000 | 2.12 | 4.278 | 6.546 | 8.328 2 | 5000000 | 5.303 | 10.89 | 16.693 | 20.793 3 | 10000000 | 10.581 | 22.407 | 33.462 | 41.91 4 | 20000000 | 21.064 | 45.954 | 67.781 | 84.828 5 | 50000000 | 52.347 | 109.551 | 166.6 | 211.053
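The indexes being timed above are plain dicts mapping key-tuples to row positions, as the tutorial later in this page shows. A sketch of a multi-column index lookup, with illustrative data:

    from tablite import Table

    t = Table({'A': [1, 1, 2], 'B': [1, 1, 2], 'C': [-1, -2, -3]})
    idx = t.index('A', 'B')      # {(1, 1): [0, 1], (2, 2): [2]}
    rows_for_key = idx[(2, 2)]   # row positions for that key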
In\u00a0[44]: def drop_duplicates_benchmark(tables):\n results = Table()\n results.add_columns(\"rows\", *list(tables[0].columns))\n\n for table in tables:\n result = [len(table)]\n cols = []\n for name in list(table.columns):\n cols.append(name)\n start_time = perf_counter()\n try:\n _ = table.drop_duplicates(*cols)\n dt = perf_counter() - start_time\n except MemoryError:\n dt = -1\n result.append(round(dt,3))\n print('.', end='')\n\n results.add_rows( result )\n return results\n In\u00a0[45]: ddb = drop_duplicates_benchmark(tables)\nddb\n ........................................................................ Out[45]: [flattened 6x13 timing matrix elided: drop_duplicates() runtimes in seconds as more columns are added to the key; 1.8-9.7 s at 1M rows rising to 64-684 s at 50M rows]"},{"location":"benchmarks/#benchmarks","title":"Benchmarks\u00b6","text":"These benchmarks seek to establish the performance of tablite as a user sees it. Overview Input/Output Various column functions Base functions Core functions - Save / Load .tpz format- Save tables to various formats- Import data from various formats - Setitem / getitem- iter- equal, not equal- copy- t += t- t *= t- contains- remove all- replace- index- unique- histogram- statistics- count - Setitem / getitem- iter / rows- equal, not equal- load- save- copy- stack- types- display_dict- show- to_dict- as_json_serializable- index - expression- filter- sort_index- reindex- drop_duplicates- sort- is_sorted- any- all- drop - replace- groupby- pivot- joins- lookup- replace missing values- transpose- pivot_transpose- diff"},{"location":"benchmarks/#input-output","title":"Input / Output\u00b6","text":""},{"location":"benchmarks/#create-tables-from-synthetic-data","title":"Create tables from synthetic data.\u00b6","text":""},{"location":"benchmarks/#save-load-tpz-format","title":"Save / Load .tpz format\u00b6","text":"With default compression settings (10% slower than uncompressed, 20% of uncompressed filesize) "},{"location":"benchmarks/#save-load-tables-to-from-various-formats","title":"Save / load tables to / from various formats\u00b6","text":"The handlers for saving / export are: - to_sql
- to_json
- to_xls
- to_ods
- to_csv
- to_tsv
- to_text
- to_html
- to_hdf5
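Each handler in the list above was driven through the same path keyword in the benchmark harness. A hedged sketch, assuming that signature holds for direct calls; the file names are illustrative:

    from pathlib import Path
    from tablite import Table

    t = Table({'A': [1, 2, 3]})
    t.to_csv(path=Path('out.csv'))
    t.to_xlsx(path=Path('out.xlsx'))
    t.to_hdf5(path=Path('out.hdf5'))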
"},{"location":"benchmarks/#various-column-functions","title":"Various column functions\u00b6","text":" - Setitem / getitem
- iter
- equal, not equal
- copy
- t += t
- t *= t
- contains
- remove all
- replace
- index
- unique
- histogram
- statistics
- count
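Most of the column functions listed above hang off the Column object returned by indexing a table. A hedged sketch; the histogram and statistics signatures are assumptions based only on the names listed here and in the changelog:

    from tablite import Table

    t = Table({'A': [1, 1, 2, 3]})
    col = t['A']
    values = col.unique()      # timed in the unique benchmark below
    n_ones = col.count(1)      # per the 2022.11.0 changelog entry
    stats = col.statistics()   # summary statistics (see the 2022.10.7 bugfix note)
    hist = col.histogram()     # assumption: listed above, signature not shown on this page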
"},{"location":"benchmarks/#various-table-functions","title":"Various table functions\u00b6","text":""},{"location":"benchmarks/#slicing","title":"Slicing\u00b6","text":"Slicing operations are used in many places. "},{"location":"benchmarks/#tabletypes","title":"Table.types()\u00b6","text":"Table.types() is implemented for near constant speed lookup. Here is an example: "},{"location":"benchmarks/#tableany","title":"Table.any\u00b6","text":""},{"location":"benchmarks/#tableall","title":"Table.all\u00b6","text":""},{"location":"benchmarks/#tablefilter","title":"Table.filter\u00b6","text":""},{"location":"benchmarks/#tableunique","title":"Table.unique\u00b6","text":""},{"location":"benchmarks/#tableindex","title":"Table.index\u00b6","text":"Single column index first: "},{"location":"benchmarks/#drop-duplicates","title":"drop duplicates\u00b6","text":""},{"location":"changelog/","title":"Changelog","text":"Version Change 2023.9.0 Adding Table.match operation. 2023.8.0 Nim backend for csv importer.Improve excel importer.Improve slicing consistency.Logical cores re-enabled on *nix based systems.Filter is now type safe.Added merge utility.Various bugfixes. 2023.6.5 Fix issues with get_headers falling back to text reading when reading 0 lines of excel, fix issue where reading excel file would ignore file count, excel file reader now has parity for linecount selection. 2023.6.4 Fix a logic bug in get_headers that caused one extra line to be returned than requested. 2023.6.3 Updated the way reference counting works. Tablite now tracks references to used pages and cleans them up based on number of references to those pages in the current process. This change allows to handle deep table clones when sending tables via processes (pickling/unpickling), whereas previous implementation would corrupt all tables using same pages due to reference counting asserting that all tables are shallow copies to the same object. 2023.6.2 Updated mplite dependency, changed to soft version requirement to prevent pipeline freezes due to small bugfixes in mplite . 2023.6.1 Major change of the backend processes. Speed up of ~6x. For more see the release notes 2022.11.19 Fixed some memory leaks. 2022.11.18 copy , filter , sort , any , all methods now properly respects the table subclass.Filter for tables with under SINGLE_PROCESSING_LIMIT rows will run on same process to reduce overhead.Errors within child processes now properly propagate to parent.Table.reset_storage(include_imports=True) now allows the user to reset the storage but exclude any imported files by setting include_imports=False during Table.reset(...) .Bug: A column with 1,None,2 would be written to csv & tsv as \"1,None,2\" . Now it is written \"1,,2\" where None means absent.Fix mp join producing mismatched columns lengths when different table lengths are used as an input or when join product is longer than the input table. 2022.11.17 Table.load now properly subclassess the table instead of always resulting in tablite.Table .Table.from_* methods now respect subclassess, fixed some from_* methods which were instance methods and not class methods.Fixed Table.from_dict only accepting list and tuple but not tablite.Column which is an equally valid type.Fix lookup parity in single process and multiple process outputs.Fix an issue with multiprocess lookup where no matches would throw instead of producing None .Fix an issue with filtering an empty table. 2022.11.16 Changed join to process 1M rows per task to avoid potential OOM on lower memory systems. 
Added mp_merge_columns to MemoryManager that merges column pages into a single column.Fix join parity in single process and multiple process outputs.Fix an issue with multiprocess join where no matches would throw instead of producing None . 2022.11.15 Bump mplite to avoid deadlock issues when the OS kills the process. 2022.11.14 Improve locking mechanism to allow retries when opening a file, as the previous solution could cause deadlocks when running multiple threads. 2022.11.13 Fix an issue with copying empty pages. 2022.11.12 Tablite is now able to create its own temporary directory. 2022.11.11 text_reader tqdm tracks the entire process now. text_reader properly respects free memory on *nix based systems. text_reader no longer discriminates against hyperthreaded cores. 2022.11.10 get_headers now uses plain openpyxl instead of the pyexcel wrapper to speed up fetch times ~10x on certain files. 2022.11.9 get_headers can fail safe on unrecognized characters. 2022.11.8 Fix a bug with task size calculation on single core systems. 2022.11.7 Added TABLITE_TMPDIR environment variable for setting the tablite work directory. Characters that fail to be read by the text reader due to improper encoding will be skipped. Fixed an issue where single column text files with no column delimiters would be imported as empty tables. 2022.11.6 Date inference fix 2022.11.5 Fixed negative slicing issues 2022.11.4 Transpose API changes: table.transpose(...) was renamed to table.pivot_transpose(...) new table.transpose() and table.T were added, its functionality acts similarly to numpy.T , the column headers are used as the first row in the table when transposing. 2022.11.3 Bugfix for non-ascii encoded strings during t.add_rows(...) 2022.11.2 As utf-8 is ascii compatible, the file reader utils selects utf-8 instead of ascii as a default. 2022.11.1 bugfix in datatypes.infer() where 1 was inferred as int, not float. 2022.11.0 New table features: Table.diff(other, columns=...) , table.remove_duplicates_rows() , table.drop_na(*arg) ,table.replace(target,replacement) , table.imputation(sources, targets, methods=...) , table.to_pandas() and Table.from_pandas(pd.DataFrame) ,table.to_dict(columns, slice) , Table.from_dict() ,table.transpose(columns, keep, ...) , New column features: Column.count(item) , Column[:] is guaranteed to return a python list.Column.to_numpy(slice) returns np.ndarray . new tools library: from tablite import tools with: date_range(start,end) , xround(value, multiple, up=None) , and guess as a short-cut for Datatypes.guess(...) . bugfixes: __eq__ was updated but missed __ne__ .in operator in filter would crash if datatypes were not strings. 2022.10.11 filter now accepts any expression (str) that can be compiled by python's compiler 2022.10.11 Bugfix for .any and .all . The code now executes much faster 2022.10.10 Bugfix for Table.import_file : import_as has been removed from keywords. 2022.10.10 All Table functions now have tqdm progressbar. 2022.10.10 More robust calculation of task size for multiprocessing. 2022.10.10 Dependency update: mplite==1.2.0 is now required. 2022.10.9 Bugfix for Table.import_file : files with duplicate header names would only have the last duplicate name imported.Now the headers are made unique using name_x where x is a number. 2022.10.8 Bugfix for groupby: Where keys are empty an error is now raised.Where there are no functions, unique keypairs are returned. 
2022.10.7 Bugfix for Column.statistics() for an empty column 2022.10.6 Bugfix for __setitem__ : tbl['a'] = [] is now seen as tbl.add_column('a') Bugfix for __getitem__ : calling a missing key raises keyerror. 2022.10.5 Bugfix for summary statistics. 2022.10.4 Bugfix for join shortcut. 2022.10.3 Bugfix for DataTypes where bool was evaluated wrongly. 2022.10.0 Added ability to reindex in table.reindex(index=[0,1...,n,n-1]) 2022.9.0 Added ability to store python objects (example).Added warning when user iterates over non-rectangular dataset. 2022.8.0 Added table.export(path) which exports tablite Tables to the file format given by the file extension. For example my_table.export('example.xlsx') .supported formats are: json , html , xlsx , xls , csv , tsv , txt , ods and sql . 2022.7.8 Added ability to forward tqdm progressbar into Table.import_file(..., tqdm=your_tqdm) , so that Jupyter notebook can use it in display -methods. 2022.7.7 Added method Table.to_sql() for export to ANSI-92 SQL engines.Bugfix on to_json for timedelta . Jupyter notebook provides nice view using Table._repr_html_() . JS-users can use .as_json_serializable where suitable. 2022.7.6 get_headers now takes argument (path, linecount=10) 2022.7.5 added helper Table.as_json_serializable as Jupyterkernel compat. 2022.7.4 added helper Table.to_dict , and updated Table.to_json 2022.7.3 table.to_json now takes kwargs: row_count , columns , slice_ , start_on 2022.7.2 documentation update. 2022.7.1 minor bugfix. 2022.7.0 BREAKING CHANGES- Tablite now uses HDF5 as backend. - Has multiprocessing enabled by default. - Is 20x faster. - Completely new API. 2022.6.0 DataTypes.guess([list of strings]) returns the best matching python datatype."},{"location":"tutorial/","title":"Tutorial","text":"In\u00a0[1]: from tablite import Table\n\n## Creating a tablite table is as simple as populating a dictionary:\nt = Table({'A':[1,2,3], 'B':['a','b','c']})\n In\u00a0[2]: ## In this notebook we can show tables in the HTML style:\nt\n Out[2]: # | A | B 0 | 1 | a 1 | 2 | b 2 | 3 | c In\u00a0[3]: ## or the ascii style:\nt.show()\n +==+=+=+\n|# |A|B|\n+--+-+-+\n| 0|1|a|\n| 1|2|b|\n| 2|3|c|\n+==+=+=+\n In\u00a0[4]: ## or if you'd like to inspect the table, use:\nprint(str(t))\n Table(2 columns, 3 rows)\n In\u00a0[5]: ## You can also add all columns at once (slower) if you prefer. \nt2 = Table(headers=('A','B'), rows=((1,'a'),(2,'b'),(3,'c')))\nassert t==t2\n In\u00a0[6]: ## or load data:\nt3 = Table.from_file('tests/data/book1.csv')\n\n## to view any table in the notebook just let jupyter show the table. If you're using the terminal use .show(). \n## Note that show gives either first and last 7 rows or the whole table if it is less than 20 rows.\nt3\n
Collecting tasks: 'tests/data/book1.csv'\nDumping tasks: 'tests/data/book1.csv'\n importing file: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 1/1 [00:00<00:00, 487.82it/s]\n Out[6]: [rendered preview of t3 elided: 45 rows x 6 float columns (a..f)] In\u00a0[7]: ## should you however want to select the headers instead of importing everything\n## (which may be time consuming), simply use get_headers(path)\nfrom tablite.tools import get_headers\nfrom pathlib import Path\npath = Path('tests/data/book1.csv')\nsample = get_headers(path, linecount=5)\nprint(f\"sample is of type {type(sample)} and has the following entries:\")\nfor k,v in sample.items():\n print(k)\n if isinstance(v,list):\n for r in sample[k]:\n print(\"\\t\", r)\n sample is of type <class 'dict'> and has the following entries:\ndelimiter\nbook1.csv\n\t ['a', 'b', 'c', 'd', 'e', 'f']\n\t ['1', '0.060606061', '0.090909091', '0.121212121', '0.151515152', '0.181818182']\n\t ['2', '0.121212121', '0.242424242', '0.484848485', '0.96969697', '1.939393939']\n\t ['3', '0.242424242', '0.484848485', '0.96969697', '1.939393939', '3.878787879']\n\t ['4', '0.484848485', '0.96969697', '1.939393939', '3.878787879', '7.757575758']\n\t ['5', '0.96969697', '1.939393939', '3.878787879', '7.757575758', '15.51515152']\n In\u00a0[8]: ## to extend a table by adding columns, use t[new] = [new values]\nt['C'] = [4,5,6]\n## but make sure the column has the same length as the rest of the table!\nt\n Out[8]: # | A | B | C 0 | 1 | a | 4 1 | 2 | b | 5 2 | 3 | c | 6 In\u00a0[9]: 
## should you want to mix datatypes, tablite will not complain:\nfrom datetime import datetime, date,time,timedelta\nimport numpy as np\n## What you put in ...\nt4 = Table()\nt4['mixed'] = [\n -1,0,1, # regular integers\n -12345678909876543211234567890987654321, # very very large integer\n None,np.nan, # null values \n \"one\", \"\", # strings\n True,False, # booleans\n float('inf'), 0.01, # floats\n date(2000,1,1), # date\n datetime(2002,2,3,23,0,4,6660), # datetime\n time(12,12,12), # time\n timedelta(days=3, seconds=5678) # timedelta\n]\n## ... is exactly what you get out:\nt4\n Out[9]: # | mixed 0 | -1 1 | 0 2 | 1 3 | -12345678909876543211234567890987654321 4 | None 5 | nan 6 | one 7 | (empty str) 8 | True 9 | False 10 | inf 11 | 0.01 12 | 2000-01-01 13 | 2002-02-03 23:00:04.006660 14 | 12:12:12 15 | 3 days, 1:34:38 In\u00a0[10]: ## also if you claim the values back as a python list:\nfor item in list(t4['mixed']):\n print(item)\n -1\n0\n1\n-12345678909876543211234567890987654321\nNone\nnan\none\n\nTrue\nFalse\ninf\n0.01\n2000-01-01\n2002-02-03 23:00:04.006660\n12:12:12\n3 days, 1:34:38\n The column itself (__repr__ ) shows us the pid , file location and the entries, so you know exactly what you're working with. In\u00a0[11]: t4['mixed']\n Out[11]: Column(/tmp/tablite-tmp/pid-54911, [-1 0 1 -12345678909876543211234567890987654321 None nan 'one' '' True\n False inf 0.01 datetime.date(2000, 1, 1)\n datetime.datetime(2002, 2, 3, 23, 0, 4, 6660) datetime.time(12, 12, 12)\n datetime.timedelta(days=3, seconds=5678)]) In\u00a0[12]: ## to view the datatypes in a column, use Column.types()\ntype_dict = t4['mixed'].types()\nfor k,v in type_dict.items():\n print(k,v)\n <class 'int'> 4\n<class 'NoneType'> 1\n<class 'float'> 3\n<class 'str'> 2\n<class 'bool'> 2\n<class 'datetime.date'> 1\n<class 'datetime.datetime'> 1\n<class 'datetime.time'> 1\n<class 'datetime.timedelta'> 1\n In\u00a0[13]: ## You may have noticed that all datatypes in t3 were identified as floats, despite their origin from a text type file.\n## This is because tablite guesses the most probable datatype using the `.guess` function on each column.
## You can use the .guess function like this: from tablite import DataTypes t3['a'] = DataTypes.guess(t3['a']) ## You can also convert the datatype using a list comprehension t3['b'] = [float(v) for v in t3['b']] t3 Out[13]: #abcdef 010.0606060610.0909090910.1212121210.1515151520.181818182 120.1212121210.2424242420.4848484850.969696971.939393939 230.2424242420.4848484850.969696971.9393939393.878787879 340.4848484850.969696971.9393939393.8787878797.757575758 450.969696971.9393939393.8787878797.75757575815.51515152 561.9393939393.8787878797.75757575815.5151515231.03030303 673.8787878797.75757575815.5151515231.0303030362.06060606.....................383916659267088.033318534175.066637068350.0133274000000.0266548000000.0394033318534175.066637068350.0133274000000.0266548000000.0533097000000.0404166637068350.0133274000000.0266548000000.0533097000000.01066190000000.04142133274000000.0266548000000.0533097000000.01066190000000.02132390000000.04243266548000000.0533097000000.01066190000000.02132390000000.04264770000000.04344533097000000.01066190000000.02132390000000.04264770000000.08529540000000.044451066190000000.02132390000000.04264770000000.08529540000000.017059100000000.0 In\u00a0[14]: Copied! t = Table()\nfor column_name in 'abcde':\n t[column_name] =[i for i in range(5)]\n t = Table() for column_name in 'abcde': t[column_name] =[i for i in range(5)] (2) we want to add two new columns using the functions: In\u00a0[15]: Copied! def f1(a,b,c):\n return a+b+c+1\ndef f2(b,c,d):\n return b*c*d\n def f1(a,b,c): return a+b+c+1 def f2(b,c,d): return b*c*d (3) and we want to compute two new columns f and g : In\u00a0[16]: Copied! t.add_columns('f', 'g')\n t.add_columns('f', 'g') (4) we can now use the filter, to iterate over the table, and add the values to the two new columns: In\u00a0[17]: Copied! f,g=[],[]\nfor row in t['a', 'b', 'c', 'd'].rows:\n a, b, c, d = row\n\n f.append(f1(a, b, c))\n g.append(f2(b, c, d))\nt['f'] = f\nt['g'] = g\n\nassert len(t) == 5\nassert list(t.columns) == list('abcdefg')\nt\n f,g=[],[] for row in t['a', 'b', 'c', 'd'].rows: a, b, c, d = row f.append(f1(a, b, c)) g.append(f2(b, c, d)) t['f'] = f t['g'] = g assert len(t) == 5 assert list(t.columns) == list('abcdefg') t Out[17]: #abcdefg 00000010 11111141 22222278 3333331027 4444441364 Take note that if your dataset is assymmetric, a warning will be show: In\u00a0[18]: Copied! assymmetric_table = Table({'a':[1,2,3], 'b':[1,2]})\nfor row in assymmetric_table.rows:\n print(row)\n## warning at the bottom ---v\n assymmetric_table = Table({'a':[1,2,3], 'b':[1,2]}) for row in assymmetric_table.rows: print(row) ## warning at the bottom ---v [1, 1]\n[2, 2]\n[3, None]\n /home/bjorn/github/tablite/tablite/base.py:1188: UserWarning: Column b has length 2 / 3. None will appear as fill value.\n warnings.warn(f\"Column {name} has length {len(column)} / {n_max}. None will appear as fill value.\")\n In\u00a0[19]: Copied! table7 = Table(columns={\n'A': [1,1,2,2,3,4],\n'B': [1,1,2,2,30,40],\n'C': [-1,-2,-3,-4,-5,-6]\n})\nindex = table7.index('A', 'B')\nfor k, v in index.items():\n print(\"key\", k, \"indices\", v)\n table7 = Table(columns={ 'A': [1,1,2,2,3,4], 'B': [1,1,2,2,30,40], 'C': [-1,-2,-3,-4,-5,-6] }) index = table7.index('A', 'B') for k, v in index.items(): print(\"key\", k, \"indices\", v) key (1, 1) indices [0, 1]\nkey (2, 2) indices [2, 3]\nkey (3, 30) indices [4]\nkey (4, 40) indices [5]\n The keys are created for each unique column-key-pair, and the value is the index where the key is found. 
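Because the index was consumed with .items() above, ordinary dict idioms apply to it; a small sketch, assuming the index behaves like a plain dict keyed by tuples:

## a sketch: dict-style access on the multi-key index
assert (2, 2) in index  ## membership test on the key pair
missing = index.get((9, 9), [])  ## safe lookup for a key that never occurs
assert missing == []
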
To fetch all rows for key (2,2) , we can use: In\u00a0[20]: Copied! for ix, row in enumerate(table7.rows):\n if ix in index[(2,2)]:\n print(row)\n for ix, row in enumerate(table7.rows): if ix in index[(2,2)]: print(row) [2, 2, -3]\n[2, 2, -4]\n In\u00a0[21]: Copied! ## to append one table to another, use + or += \nprint('length before:', len(t3)) # length before: 45\nt5 = t3 + t3 \nprint('length after +', len(t5)) # length after + 90\nt5 += t3 \nprint('length after +=', len(t5)) # length after += 135\n## if you need a lot of numbers for a test, you can repeat a table using * and *=\nt5 *= 1_000\nprint('length after +=', len(t5)) # length after += 135000\n ## to append one table to another, use + or += print('length before:', len(t3)) # length before: 45 t5 = t3 + t3 print('length after +', len(t5)) # length after + 90 t5 += t3 print('length after +=', len(t5)) # length after += 135 ## if you need a lot of numbers for a test, you can repeat a table using * and *= t5 *= 1_000 print('length after +=', len(t5)) # length after += 135000 length before: 45\nlength after + 90\nlength after += 135\nlength after += 135000\n In\u00a0[22]: Copied! t5\n t5 Out[22]: #abcdef 010.0606060610.0909090910.1212121210.1515151520.181818182 120.1212121210.2424242420.4848484850.969696971.939393939 230.2424242420.4848484850.969696971.9393939393.878787879 340.4848484850.969696971.9393939393.8787878797.757575758 450.969696971.9393939393.8787878797.75757575815.51515152 561.9393939393.8787878797.75757575815.5151515231.03030303 673.8787878797.75757575815.5151515231.0303030362.06060606..................... 134,9933916659267088.033318534175.066637068350.0133274000000.0266548000000.0 134,9944033318534175.066637068350.0133274000000.0266548000000.0533097000000.0 134,9954166637068350.0133274000000.0266548000000.0533097000000.01066190000000.0 134,99642133274000000.0266548000000.0533097000000.01066190000000.02132390000000.0 134,99743266548000000.0533097000000.01066190000000.02132390000000.04264770000000.0 134,99844533097000000.01066190000000.02132390000000.04264770000000.08529540000000.0 134,999451066190000000.02132390000000.04264770000000.08529540000000.017059100000000.0 In\u00a0[23]: Copied! ## if your are in doubt whether your tables will be the same you can use .stack(other)\nassert t.columns != t2.columns # compares list of column names.\nt6 = t.stack(t2)\nt6\n ## if your are in doubt whether your tables will be the same you can use .stack(other) assert t.columns != t2.columns # compares list of column names. t6 = t.stack(t2) t6 Out[23]: #abcdefgAB 00000010NoneNone 11111141NoneNone 22222278NoneNone 3333331027NoneNone 4444441364NoneNone 5NoneNoneNoneNoneNoneNoneNone1a 6NoneNoneNoneNoneNoneNoneNone2b 7NoneNoneNoneNoneNoneNoneNone3c In\u00a0[24]: Copied! ## As you can see above, t6['C'] is padded with \"None\" where t2 was missing the columns.\n\n## if you need a more detailed view of the columns you can iterate:\nfor name in t.columns:\n col_from_t = t[name]\n if name in t2.columns:\n col_from_t2 = t2[name]\n print(name, col_from_t == col_from_t2)\n else:\n print(name, \"not in t2\")\n ## As you can see above, t6['C'] is padded with \"None\" where t2 was missing the columns. ## if you need a more detailed view of the columns you can iterate: for name in t.columns: col_from_t = t[name] if name in t2.columns: col_from_t2 = t2[name] print(name, col_from_t == col_from_t2) else: print(name, \"not in t2\") a not in t2\nb not in t2\nc not in t2\nd not in t2\ne not in t2\nf not in t2\ng not in t2\n In\u00a0[25]: Copied! 
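The non in-place * works like *= above but returns a new table; a hedged sketch, assuming the original table is left untouched:

## a sketch: repeat into a new table instead of in-place
t_repeated = t3 * 3
assert len(t_repeated) == 3 * len(t3)
assert len(t3) == 45  ## t3 itself is unchanged
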
## to make a copy of a table, use table.copy()\nt3_copy = t3.copy()\n\n## you can also perform multi criteria selections using getitem [ ... ]\nt3_slice = t3['a','b','d', 5:25:5]\nt3_slice\n ## to make a copy of a table, use table.copy() t3_copy = t3.copy() ## you can also perform multi criteria selections using getitem [ ... ] t3_slice = t3['a','b','d', 5:25:5] t3_slice Out[25]: #abd 061.9393939397.757575758 11162.06060606248.2424242 2161985.9393947943.757576 32163550.06061254200.2424 In\u00a0[26]: Copied! ##deleting items also works the same way:\ndel t3_slice[1:3] # delete row number 2 & 3 \nt3_slice\n ##deleting items also works the same way: del t3_slice[1:3] # delete row number 2 & 3 t3_slice Out[26]: #abd 061.9393939397.757575758 12163550.06061254200.2424 In\u00a0[27]: Copied! ## to wipe a table, use .clear:\nt3_slice.clear()\nt3_slice\n ## to wipe a table, use .clear: t3_slice.clear() t3_slice Out[27]: Empty Table In\u00a0[28]: Copied! ## tablite uses .npy for storage because it is fast.\n## this means you can make a table persistent using .save\nlocal_file = Path(\"local_file.tpz\")\nt5.save(local_file)\n\nold_t5 = Table.load(local_file)\nprint(\"the t5 table had\", len(old_t5), \"rows\") # the t5 table had 135000 rows\n\ndel old_t5 # only removes the in-memory object\n\nprint(\"old_t5 still exists?\", local_file.exists())\nprint(\"path:\", local_file)\n\nimport os\nos.remove(local_file)\n ## tablite uses .npy for storage because it is fast. ## this means you can make a table persistent using .save local_file = Path(\"local_file.tpz\") t5.save(local_file) old_t5 = Table.load(local_file) print(\"the t5 table had\", len(old_t5), \"rows\") # the t5 table had 135000 rows del old_t5 # only removes the in-memory object print(\"old_t5 still exists?\", local_file.exists()) print(\"path:\", local_file) import os os.remove(local_file) loading 'local_file.tpz' file: 55%|\u2588\u2588\u2588\u2588\u2588\u258d | 9851/18000 [00:02<00:01, 4386.96it/s] loading 'local_file.tpz' file: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 18000/18000 [00:04<00:00, 4417.27it/s]\n the t5 table had 135000 rows\nold_t5 still exists? True\npath: local_file.tpz\n If you want to save a table from one session to another use save=True . This tells the garbage collector to leave the tablite Table on disk, so you can load it again without changing your code. For example: First time you run t = Table.import_file(....big.csv) it may take a minute or two. If you then add t.save=True and restart python, the second time you run t = Table.import_file(....big.csv) it will take a few milliseconds instead of minutes. In\u00a0[29]: Copied! unfiltered = Table({'a':[1,2,3,4], 'b':[10,20,30,40]})\n unfiltered = Table({'a':[1,2,3,4], 'b':[10,20,30,40]}) In\u00a0[30]: Copied! true,false = unfiltered.filter(\n [\n {\"column1\": 'a', \"criteria\":\">=\", 'value2':3}\n ], filter_type='all'\n)\n true,false = unfiltered.filter( [ {\"column1\": 'a', \"criteria\":\">=\", 'value2':3} ], filter_type='all' ) In\u00a0[31]: Copied! true\n true Out[31]: #ab 0330 1440 In\u00a0[32]: Copied! false.show() # using show here to show that terminal users can have a nice view too.\n false.show() # using show here to show that terminal users can have a nice view too. +==+=+==+\n|# |a|b |\n+--+-+--+\n| 0|1|10|\n| 1|2|20|\n+==+=+==+\n In\u00a0[33]: Copied! ty = Table({'a':[1,2,3,4],'b': [10,20,30,40]})\n ty = Table({'a':[1,2,3,4],'b': [10,20,30,40]}) In\u00a0[34]: Copied! 
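filter also takes several expression dicts at once; a hedged sketch, assuming filter_type='any' is the union counterpart of the 'all' shown above:

## a sketch: two criteria combined with filter_type='any' (assumption)
true_rows, false_rows = unfiltered.filter(
    [
        {"column1": 'a', "criteria": ">=", "value2": 3},
        {"column1": 'b', "criteria": "==", "value2": 10},
    ],
    filter_type='any',
)
assert len(true_rows) == 3  ## a>=3 matches two rows, b==10 adds the first row
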
## typical python\nany(i > 3 for i in ty['a'])\n ## typical python any(i > 3 for i in ty['a']) Out[34]: True In\u00a0[35]: Copied! ## hereby you can do:\nany( ty.any(**{'a':lambda x:x>3}).rows )\n ## hereby you can do: any( ty.any(**{'a':lambda x:x>3}).rows ) Out[35]: True In\u00a0[36]: Copied! ## if you have multiple criteria this also works:\nall( ty.all(**{'a': lambda x:x>=2, 'b': lambda x:x<=30}).rows )\n ## if you have multiple criteria this also works: all( ty.all(**{'a': lambda x:x>=2, 'b': lambda x:x<=30}).rows ) Out[36]: True In\u00a0[37]: Copied! ## or this if you want to see the table.\nty.all(a=lambda x:x>2, b=lambda x:x<=30)\n ## or this if you want to see the table. ty.all(a=lambda x:x>2, b=lambda x:x<=30) Out[37]: #ab 0330 In\u00a0[38]: Copied! ## As `all` and `any` returns tables, this also means that you can chain operations:\nty.any(a=lambda x:x>2).any(b=30)\n ## As `all` and `any` returns tables, this also means that you can chain operations: ty.any(a=lambda x:x>2).any(b=30) Out[38]: #ab 0330 In\u00a0[39]: Copied! table = Table({\n 'A':[ 1, None, 8, 3, 4, 6, 5, 7, 9],\n 'B':[10,'100', 1, 1, 1, 1, 10, 10, 10],\n 'C':[ 0, 1, 0, 1, 0, 1, 0, 1, 0],\n})\ntable\n table = Table({ 'A':[ 1, None, 8, 3, 4, 6, 5, 7, 9], 'B':[10,'100', 1, 1, 1, 1, 10, 10, 10], 'C':[ 0, 1, 0, 1, 0, 1, 0, 1, 0], }) table Out[39]: #ABC 01100 1None1001 2810 3311 4410 5611 65100 77101 89100 In\u00a0[40]: Copied! sort_order = {'B': False, 'C': False, 'A': False}\nassert not table.is_sorted(mapping=sort_order)\n\nsorted_table = table.sort(mapping=sort_order)\nsorted_table\n sort_order = {'B': False, 'C': False, 'A': False} assert not table.is_sorted(mapping=sort_order) sorted_table = table.sort(mapping=sort_order) sorted_table creating sort index: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 3/3 [00:00<00:00, 2719.45it/s]\ncreating sort index: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 3/3 [00:00<00:00, 3434.20it/s]\njoin: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 3/3 [00:00<00:00, 1902.47it/s]\n Sort is reasonable effective as it uses multiprocessing above a million fields. Hint: You can set this limit in tablite.config , like this: In\u00a0[41]: Copied! from tablite.config import Config\nprint(f\"multiprocessing is used above {Config.SINGLE_PROCESSING_LIMIT:,} fields\")\n from tablite.config import Config print(f\"multiprocessing is used above {Config.SINGLE_PROCESSING_LIMIT:,} fields\") multiprocessing is used above 1,000,000 fields\n In\u00a0[42]: Copied! import math\nn = math.ceil(1_000_000 / (9*3))\n\ntable = Table({\n 'A':[ 1, None, 8, 3, 4, 6, 5, 7, 9]*n,\n 'B':[10,'100', 1, 1, 1, 1, 10, 10, 10]*n,\n 'C':[ 0, 1, 0, 1, 0, 1, 0, 1, 0]*n,\n})\ntable\n import math n = math.ceil(1_000_000 / (9*3)) table = Table({ 'A':[ 1, None, 8, 3, 4, 6, 5, 7, 9]*n, 'B':[10,'100', 1, 1, 1, 1, 10, 10, 10]*n, 'C':[ 0, 1, 0, 1, 0, 1, 0, 1, 0]*n, }) table Out[42]: #ABC 01100 1None1001 2810 3311 4410 5611 65100............ 333,335810 333,336311 333,337410 333,338611 333,3395100 333,3407101 333,3419100 In\u00a0[43]: Copied! import time as cputime\nstart = cputime.time()\nsort_order = {'B': False, 'C': False, 'A': False}\nsorted_table = table.sort(mapping=sort_order) # sorts 1M values.\nprint(\"table sorting took \", round(cputime.time() - start,3), \"secs\")\nsorted_table\n import time as cputime start = cputime.time() sort_order = {'B': False, 'C': False, 'A': False} sorted_table = table.sort(mapping=sort_order) # sorts 1M values. 
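## note (a sketch): the sort above uses multiprocessing because the table exceeds
## Config.SINGLE_PROCESSING_LIMIT (1,000,000 fields, as printed earlier).
## the threshold could be tuned before sorting, e.g.:
## from tablite.config import Config
## Config.SINGLE_PROCESSING_LIMIT = 5_000_000  ## hypothetical new limit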
print(\"table sorting took \", round(cputime.time() - start,3), \"secs\") sorted_table creating sort index: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 3/3 [00:00<00:00, 4.20it/s]\njoin: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 3/3 [00:00<00:00, 18.17it/s] table sorting took 0.913 secs\n \n In\u00a0[44]: Copied! n = math.ceil(1_000_000 / (9*3))\n\ntable = Table({\n 'A':[ 1, None, 8, 3, 4, 6, 5, 7, 9]*n,\n 'B':[10,'100', 1, 1, 1, 1, 10, 10, 10]*n,\n 'C':[ 0, 1, 0, 1, 0, 1, 0, 1, 0]*n,\n})\ntable\n n = math.ceil(1_000_000 / (9*3)) table = Table({ 'A':[ 1, None, 8, 3, 4, 6, 5, 7, 9]*n, 'B':[10,'100', 1, 1, 1, 1, 10, 10, 10]*n, 'C':[ 0, 1, 0, 1, 0, 1, 0, 1, 0]*n, }) table Out[44]: #ABC 01100 1None1001 2810 3311 4410 5611 65100............ 333,335810 333,336311 333,337410 333,338611 333,3395100 333,3407101 333,3419100 In\u00a0[45]: Copied! from tablite import GroupBy as gb\ngrpby = table.groupby(keys=['C', 'B'], functions=[('A', gb.count)])\ngrpby\n from tablite import GroupBy as gb grpby = table.groupby(keys=['C', 'B'], functions=[('A', gb.count)]) grpby groupby: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 333342/333342 [00:00<00:00, 427322.50it/s]\n Out[45]: #CBCount(A) 0010111114 1110037038 20174076 31174076 411037038 Here is the list of groupby functions: class GroupBy(object): \n max = Max # shortcuts to avoid having to type a long list of imports.\n min = Min\n sum = Sum\n product = Product\n first = First\n last = Last\n count = Count\n count_unique = CountUnique\n avg = Average\n stdev = StandardDeviation\n median = Median\n mode = Mode\n In\u00a0[46]: Copied! t = Table({\n 'A':[1, 1, 2, 2, 3, 3] * 2,\n 'B':[1, 2, 3, 4, 5, 6] * 2,\n 'C':[6, 5, 4, 3, 2, 1] * 2,\n})\nt\n t = Table({ 'A':[1, 1, 2, 2, 3, 3] * 2, 'B':[1, 2, 3, 4, 5, 6] * 2, 'C':[6, 5, 4, 3, 2, 1] * 2, }) t Out[46]: #ABC 0116 1125 2234 3243 4352 5361 6116 7125 8234 92431035211361 In\u00a0[47]: Copied! t2 = t.pivot(rows=['C'], columns=['A'], functions=[('B', gb.sum), ('B', gb.count)], values_as_rows=False)\nt2\n t2 = t.pivot(rows=['C'], columns=['A'], functions=[('B', gb.sum), ('B', gb.count)], values_as_rows=False) t2 pivot: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 14/14 [00:00<00:00, 3643.83it/s]\n Out[47]: #CSum(B,A=1)Count(B,A=1)Sum(B,A=2)Count(B,A=2)Sum(B,A=3)Count(B,A=3) 0622NoneNoneNoneNone 1542NoneNoneNoneNone 24NoneNone62NoneNone 33NoneNone82NoneNone 42NoneNoneNoneNone102 51NoneNoneNoneNone122 In\u00a0[48]: Copied! numbers = Table()\nnumbers.add_column('number', data=[ 1, 2, 3, 4, None])\nnumbers.add_column('colour', data=['black', 'blue', 'white', 'white', 'blue'])\n\nletters = Table()\nletters.add_column('letter', data=[ 'a', 'b', 'c', 'd', None])\nletters.add_column('color', data=['blue', 'white', 'orange', 'white', 'blue'])\n numbers = Table() numbers.add_column('number', data=[ 1, 2, 3, 4, None]) numbers.add_column('colour', data=['black', 'blue', 'white', 'white', 'blue']) letters = Table() letters.add_column('letter', data=[ 'a', 'b', 'c', 'd', None]) letters.add_column('color', data=['blue', 'white', 'orange', 'white', 'blue']) In\u00a0[49]: Copied! 
## left join\n## SELECT number, letter FROM numbers LEFT JOIN letters ON numbers.colour == letters.color\nleft_join = numbers.left_join(letters, left_keys=['colour'], right_keys=['color'], left_columns=['number'], right_columns=['letter'])\nleft_join\n ## left join ## SELECT number, letter FROM numbers LEFT JOIN letters ON numbers.colour == letters.color left_join = numbers.left_join(letters, left_keys=['colour'], right_keys=['color'], left_columns=['number'], right_columns=['letter']) left_join join: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 2/2 [00:00<00:00, 1221.94it/s]\n Out[49]: #numberletter 01None 12a 22None 3Nonea 4NoneNone 53b 63d 74b 84d In\u00a0[50]: Copied! ## inner join\n## SELECT number, letter FROM numbers JOIN letters ON numbers.colour == letters.color\ninner_join = numbers.inner_join(letters, left_keys=['colour'], right_keys=['color'], left_columns=['number'], right_columns=['letter'])\ninner_join\n ## inner join ## SELECT number, letter FROM numbers JOIN letters ON numbers.colour == letters.color inner_join = numbers.inner_join(letters, left_keys=['colour'], right_keys=['color'], left_columns=['number'], right_columns=['letter']) inner_join join: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 2/2 [00:00<00:00, 1121.77it/s]\n Out[50]: #numberletter 02a 12None 2Nonea 3NoneNone 43b 53d 64b 74d In\u00a0[51]: Copied! # outer join\n## SELECT number, letter FROM numbers OUTER JOIN letters ON numbers.colour == letters.color\nouter_join = numbers.outer_join(letters, left_keys=['colour'], right_keys=['color'], left_columns=['number'], right_columns=['letter'])\nouter_join\n # outer join ## SELECT number, letter FROM numbers OUTER JOIN letters ON numbers.colour == letters.color outer_join = numbers.outer_join(letters, left_keys=['colour'], right_keys=['color'], left_columns=['number'], right_columns=['letter']) outer_join join: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 2/2 [00:00<00:00, 1585.15it/s]\n Out[51]: #numberletter 01None 12a 22None 3Nonea 4NoneNone 53b 63d 74b 84d 9Nonec Q: But ...I think there's a bug in the join... A: Venn diagrams do not explain joins. A Venn diagram is a widely-used diagram style that shows the logical relation between sets, popularised by John Venn in the 1880s. The diagrams are used to teach elementary set theory, and to illustrate simple set relationshipssource: en.wikipedia.org Joins operate over rows and when there are duplicate rows, these will be replicated in the output. Many beginners are surprised by this, because they didn't read the SQL standard. Q: So what do I do? A: If you want to get rid of duplicates using tablite, use the index functionality across all columns and pick the first row from each index. Here's the recipe that starts with plenty of duplicates: In\u00a0[52]: Copied! old_table = Table({\n'A':[1,1,1,2,2,2,3,3,3],\n'B':[1,1,4,2,2,5,3,3,6],\n})\nold_table\n old_table = Table({ 'A':[1,1,1,2,2,2,3,3,3], 'B':[1,1,4,2,2,5,3,3,6], }) old_table Out[52]: #AB 011 111 214 322 422 525 633 733 836 In\u00a0[53]: Copied! ## CREATE TABLE OF UNIQUE ENTRIES (a.k.a. DEDUPLICATE)\nnew_table = old_table.drop_duplicates()\nnew_table\n ## CREATE TABLE OF UNIQUE ENTRIES (a.k.a. DEDUPLICATE) new_table = old_table.drop_duplicates() new_table 9it [00:00, 11329.15it/s]\njoin: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 2/2 [00:00<00:00, 1819.26it/s]\n Out[53]: #AB 011 114 222 325 433 536 You can also use groupby; We'll get to that in a minute. 
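And here is that groupby route sketched out, assuming that grouping on every column yields exactly one row per unique combination:

## a sketch: deduplicate by grouping on all columns
from tablite import GroupBy as gb
unique = old_table.groupby(keys=['A', 'B'], functions=[('A', gb.count)])
unique  ## one row per unique (A, B); Count(A) shows how many duplicates were collapsed
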
Lookup is a special case of a search loop: Say for example you are planning a concert and want to make sure that your friends can make it home using public transport: You would have to find the first departure after the concert ends towards their home. A join would only give you a direct match on the time. Lookup allows you \"to iterate through a list of data and find the first match given a set of criteria.\" Here's an example: First we have our list of friends and their stops. In\u00a0[54]: Copied! friends = Table({\n\"name\":['Alice', 'Betty', 'Charlie', 'Dorethy', 'Edward', 'Fred'],\n\"stop\":['Downtown-1', 'Downtown-2', 'Hillside View', 'Hillside Crescent', 'Downtown-2', 'Chicago'],\n})\nfriends\n friends = Table({ \"name\":['Alice', 'Betty', 'Charlie', 'Dorethy', 'Edward', 'Fred'], \"stop\":['Downtown-1', 'Downtown-2', 'Hillside View', 'Hillside Crescent', 'Downtown-2', 'Chicago'], }) friends Out[54]: #namestop 0AliceDowntown-1 1BettyDowntown-2 2CharlieHillside View 3DorethyHillside Crescent 4EdwardDowntown-2 5FredChicago Next we need a list of bus routes and their time and stops. I don't have that, so I'm making one up: In\u00a0[55]: Copied! import random\nrandom.seed(11)\ntable_size = 40\n\ntimes = [DataTypes.time(random.randint(21, 23), random.randint(0, 59)) for i in range(table_size)]\nstops = ['Stadium', 'Hillside', 'Hillside View', 'Hillside Crescent', 'Downtown-1', 'Downtown-2',\n 'Central station'] * 2 + [f'Random Road-{i}' for i in range(table_size)]\nroute = [random.choice([1, 2, 3]) for i in stops]\n import random random.seed(11) table_size = 40 times = [DataTypes.time(random.randint(21, 23), random.randint(0, 59)) for i in range(table_size)] stops = ['Stadium', 'Hillside', 'Hillside View', 'Hillside Crescent', 'Downtown-1', 'Downtown-2', 'Central station'] * 2 + [f'Random Road-{i}' for i in range(table_size)] route = [random.choice([1, 2, 3]) for i in stops] In\u00a0[56]: Copied! bus_table = Table({\n\"time\":times,\n\"stop\":stops[:table_size],\n\"route\":route[:table_size],\n})\nbus_table.sort(mapping={'time': False})\n\nprint(\"Departures from Concert Hall towards ...\")\nbus_table[0:10]\n bus_table = Table({ \"time\":times, \"stop\":stops[:table_size], \"route\":route[:table_size], }) bus_table.sort(mapping={'time': False}) print(\"Departures from Concert Hall towards ...\") bus_table[0:10] creating sort index: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 1/1 [00:00<00:00, 1459.90it/s]\njoin: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 3/3 [00:00<00:00, 2421.65it/s]\n Departures from Concert Hall towards ...\n Out[56]: #timestoproute 021:02:00Random Road-62 121:05:00Hillside Crescent2 221:06:00Hillside1 321:25:00Random Road-241 421:29:00Random Road-161 521:32:00Random Road-211 621:33:00Random Road-121 721:36:00Random Road-233 821:38:00Central station2 921:38:00Random Road-82 Let's say the concerts ends at 21:00 and it takes a 10 minutes to get to the bus-stop. Earliest departure must then be 21:10 - goodbye hugs included. In\u00a0[57]: Copied! 
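Each criterion passed to lookup below is a 3-tuple of (value or left column, operator, right column); a hedged reading of the two criteria used next:

## (DataTypes.time(21, 10), "<=", 'time')  ## the constant 21:10 must be <= the bus departure 'time'
## ('stop', "==", 'stop')  ## the friend's 'stop' must equal the bus 'stop'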
lookup_1 = friends.lookup(bus_table, (DataTypes.time(21, 10), \"<=\", 'time'), ('stop', \"==\", 'stop'))\nlookup1_sorted = lookup_1.sorted(mapping={'time': False, 'name':False}, sort_mode='unix')\nlookup1_sorted\n lookup_1 = friends.lookup(bus_table, (DataTypes.time(21, 10), \"<=\", 'time'), ('stop', \"==\", 'stop')) lookup1_sorted = lookup_1.sorted(mapping={'time': False, 'name':False}, sort_mode='unix') lookup1_sorted 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 6/6 [00:00<00:00, 1513.92it/s]\njoin: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 3/3 [00:00<00:00, 2003.65it/s]\ncreating sort index: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 2/2 [00:00<00:00, 2589.88it/s]\njoin: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 5/5 [00:00<00:00, 2034.29it/s]\n Out[57]: #namestoptimestop_1route 0FredChicagoNoneNoneNone 1BettyDowntown-221:51:00Downtown-21 2EdwardDowntown-221:51:00Downtown-21 3CharlieHillside View22:19:00Hillside View2 4AliceDowntown-123:12:00Downtown-13 5DorethyHillside Crescent23:54:00Hillside Crescent1 Lookup's ability to custom criteria is thereby far more versatile than SQL joins. But with great power comes great responsibility. In\u00a0[58]: Copied! materials = Table({\n 'bom_id': [1, 2, 3, 4, 5, 6, 7, 8, 9], \n 'partial_of': [1, 2, 3, 4, 5, 6, 7, 4, 6], \n 'sku': ['A', 'irrelevant', 'empty carton', 'pkd carton', 'empty pallet', 'pkd pallet', 'pkd irrelevant', 'ppkd carton', 'ppkd pallet'], \n 'material_id': [None, None, None, 3, None, 5, 3, 3, 5], \n 'quantity': [10, 20, 30, 40, 50, 60, 70, 80, 90]\n})\n # 9 is a partially packed pallet of 6\n\n## multiple values.\nlooking_for = Table({\n 'bom_id': [3,4,6], \n 'moq': [1,2,3]\n })\n materials = Table({ 'bom_id': [1, 2, 3, 4, 5, 6, 7, 8, 9], 'partial_of': [1, 2, 3, 4, 5, 6, 7, 4, 6], 'sku': ['A', 'irrelevant', 'empty carton', 'pkd carton', 'empty pallet', 'pkd pallet', 'pkd irrelevant', 'ppkd carton', 'ppkd pallet'], 'material_id': [None, None, None, 3, None, 5, 3, 3, 5], 'quantity': [10, 20, 30, 40, 50, 60, 70, 80, 90] }) # 9 is a partially packed pallet of 6 ## multiple values. looking_for = Table({ 'bom_id': [3,4,6], 'moq': [1,2,3] }) Our goals is now to find the quantity from the materials table based on the items in the looking_for table. This requires two steps: - lookup
- filter with
all by dropping items that didn't match. In\u00a0[59]: Copied! ## step 1/2:\nproducts_lookup = materials.lookup(looking_for, (\"bom_id\", \"==\", \"bom_id\"), (\"partial_of\", \"==\", \"bom_id\"), all=False) \nproducts_lookup\n ## step 1/2: products_lookup = materials.lookup(looking_for, (\"bom_id\", \"==\", \"bom_id\"), (\"partial_of\", \"==\", \"bom_id\"), all=False) products_lookup 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 9/9 [00:00<00:00, 3651.81it/s]\njoin: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 2/2 [00:00<00:00, 1625.38it/s]\n Out[59]: #bom_idpartial_ofskumaterial_idquantitybom_id_1moq 011ANone10NoneNone 122irrelevantNone20NoneNone 233empty cartonNone3031 344pkd carton34042 455empty palletNone50NoneNone 566pkd pallet56063 677pkd irrelevant370NoneNone 784ppkd carton38042 896ppkd pallet59063 In\u00a0[60]: Copied! ## step 2/2:\nproducts = products_lookup.all(bom_id_1=lambda x: x is not None)\nproducts\n ## step 2/2: products = products_lookup.all(bom_id_1=lambda x: x is not None) products Out[60]: #bom_idpartial_ofskumaterial_idquantitybom_id_1moq 033empty cartonNone3031 144pkd carton34042 266pkd pallet56063 384ppkd carton38042 496ppkd pallet59063 The faster way to solve this problem is to use match ! Here is the example: In\u00a0[61]: Copied! products_matched = materials.match(looking_for, (\"bom_id\", \"==\", \"bom_id\"), (\"partial_of\", \"==\", \"bom_id\"))\nproducts_matched\n products_matched = materials.match(looking_for, (\"bom_id\", \"==\", \"bom_id\"), (\"partial_of\", \"==\", \"bom_id\")) products_matched Out[61]: #bom_idpartial_ofskumaterial_idquantitybom_id_1moq 033empty cartonNone3031 144pkd carton34042 266pkd pallet56063 384ppkd carton38042 496ppkd pallet59063 In\u00a0[62]: Copied! assert products == products_matched\n assert products == products_matched In\u00a0[63]: Copied! from tablite import Table\nt = Table() # create table\nt.add_columns('row','A','B','C') # add columns\n from tablite import Table t = Table() # create table t.add_columns('row','A','B','C') # add columns The following examples are all valid and append the row (1,2,3) to the table. In\u00a0[64]: Copied! t.add_rows(1, 1, 2, 3) # individual values\nt.add_rows([2, 1, 2, 3]) # list of values\nt.add_rows((3, 1, 2, 3)) # tuple of values\nt.add_rows(*(4, 1, 2, 3)) # unpacked tuple\nt.add_rows(row=5, A=1, B=2, C=3) # keyword - args\nt.add_rows(**{'row': 6, 'A': 1, 'B': 2, 'C': 3}) # dict / json.\n t.add_rows(1, 1, 2, 3) # individual values t.add_rows([2, 1, 2, 3]) # list of values t.add_rows((3, 1, 2, 3)) # tuple of values t.add_rows(*(4, 1, 2, 3)) # unpacked tuple t.add_rows(row=5, A=1, B=2, C=3) # keyword - args t.add_rows(**{'row': 6, 'A': 1, 'B': 2, 'C': 3}) # dict / json. The following examples add two rows to the table In\u00a0[65]: Copied! t.add_rows((7, 1, 2, 3), (8, 4, 5, 6)) # two (or more) tuples.\nt.add_rows([9, 1, 2, 3], [10, 4, 5, 6]) # two or more lists\nt.add_rows({'row': 11, 'A': 1, 'B': 2, 'C': 3},\n {'row': 12, 'A': 4, 'B': 5, 'C': 6}) # two (or more) dicts as args.\nt.add_rows(*[{'row': 13, 'A': 1, 'B': 2, 'C': 3},\n {'row': 14, 'A': 1, 'B': 2, 'C': 3}]) # list of dicts.\n t.add_rows((7, 1, 2, 3), (8, 4, 5, 6)) # two (or more) tuples. t.add_rows([9, 1, 2, 3], [10, 4, 5, 6]) # two or more lists t.add_rows({'row': 11, 'A': 1, 'B': 2, 'C': 3}, {'row': 12, 'A': 4, 'B': 5, 'C': 6}) # two (or more) dicts as args. t.add_rows(*[{'row': 13, 'A': 1, 'B': 2, 'C': 3}, {'row': 14, 'A': 1, 'B': 2, 'C': 3}]) # list of dicts. In\u00a0[66]: Copied! 
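Because row based operations cause a lot of IO, the same rows can also be loaded column-wise in a single pass; a small sketch:

## a sketch: column-wise loading instead of add_rows, one assignment per column
rows = [(1, 1, 2, 3), (2, 1, 2, 3), (3, 1, 2, 3)]  ## ...and so on
fast_t = Table()
for i, name in enumerate(['row', 'A', 'B', 'C']):
    fast_t[name] = [r[i] for r in rows]
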
t\n t Out[66]: #rowABC 01123 12123 23123 34123 45123 56123 67123 78456 89123 9104561011123111245612131231314123 As the row incremented from 1 in the first of these examples, and finished with row: 14 , you can now see the whole table above In\u00a0[67]: Copied! from pathlib import Path\npath = Path('tests/data/book1.csv')\ntx = Table.from_file(path)\ntx\n from pathlib import Path path = Path('tests/data/book1.csv') tx = Table.from_file(path) tx Collecting tasks: 'tests/data/book1.csv'\nDumping tasks: 'tests/data/book1.csv'\n importing file: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 1/1 [00:00<00:00, 444.08it/s]\n Out[67]: #abcdef 010.0606060610.0909090910.1212121210.1515151520.181818182 120.1212121210.2424242420.4848484850.969696971.939393939 230.2424242420.4848484850.969696971.9393939393.878787879 340.4848484850.969696971.9393939393.8787878797.757575758 450.969696971.9393939393.8787878797.75757575815.51515152 561.9393939393.8787878797.75757575815.5151515231.03030303 673.8787878797.75757575815.5151515231.0303030362.06060606.....................383916659267088.033318534175.066637068350.0133274000000.0266548000000.0394033318534175.066637068350.0133274000000.0266548000000.0533097000000.0404166637068350.0133274000000.0266548000000.0533097000000.01066190000000.04142133274000000.0266548000000.0533097000000.01066190000000.02132390000000.04243266548000000.0533097000000.01066190000000.02132390000000.04264770000000.04344533097000000.01066190000000.02132390000000.04264770000000.08529540000000.044451066190000000.02132390000000.04264770000000.08529540000000.017059100000000.0 Note that you can also add start, limit and chunk_size to the file reader. Here's an example: In\u00a0[68]: Copied! path = Path('tests/data/book1.csv')\ntx2 = Table.from_file(path, start=2, limit=15)\ntx2\n path = Path('tests/data/book1.csv') tx2 = Table.from_file(path, start=2, limit=15) tx2 Collecting tasks: 'tests/data/book1.csv'\n importing file: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 1/1 [00:00<00:00, 391.22it/s] Dumping tasks: 'tests/data/book1.csv'\n \n Out[68]: #abcdef 030.2424242420.4848484850.969696971.9393939393.878787879 140.4848484850.969696971.9393939393.8787878797.757575758 250.969696971.9393939393.8787878797.75757575815.51515152 361.9393939393.8787878797.75757575815.5151515231.03030303 473.8787878797.75757575815.5151515231.0303030362.06060606 587.75757575815.5151515231.0303030362.06060606124.1212121 6915.5151515231.0303030362.06060606124.1212121248.2424242 71031.0303030362.06060606124.1212121248.2424242496.4848485 81162.06060606124.1212121248.2424242496.4848485992.969697 912124.1212121248.2424242496.4848485992.9696971985.9393941013248.2424242496.4848485992.9696971985.9393943971.8787881114496.4848485992.9696971985.9393943971.8787887943.7575761215992.9696971985.9393943971.8787887943.75757615887.5151513161985.9393943971.8787887943.75757615887.5151531775.030314173971.8787887943.75757615887.5151531775.030363550.06061 How good is the file_reader? I've included all formats in the test suite that are publicly available from the Alan Turing institute, dateutils) and Python's csv reader. What about MM-DD-YYYY formats? Some users from the US ask why the csv reader doesn't read the month-day-year format. The answer is simple: It's not an iso8601 format. The US month-day-year format is a locale that may be used a lot in the US, but it isn't an international standard. If you need to work with MM-DD-YYYY you will find that the file_reader will import the values as text (str). 
You can then reformat it with a custom function like: In\u00a0[69]: Copied! s = \"03-21-1998\"\nfrom datetime import date\nf = lambda s: date(int(s[-4:]), int(s[:2]), int(s[3:5]))\nf(s)\n s = \"03-21-1998\" from datetime import date f = lambda s: date(int(s[-4:]), int(s[:2]), int(s[3:5])) f(s) Out[69]: datetime.date(1998, 3, 21) In\u00a0[70]: Copied! from tablite.import_utils import file_readers\nfor k,v in file_readers.items():\n print(k,v)\n from tablite.import_utils import file_readers for k,v in file_readers.items(): print(k,v) fods <function excel_reader at 0x7f36a3ef8c10>\njson <function excel_reader at 0x7f36a3ef8c10>\nhtml <function from_html at 0x7f36a3ef8b80>\nhdf5 <function from_hdf5 at 0x7f36a3ef8a60>\nsimple <function excel_reader at 0x7f36a3ef8c10>\nrst <function excel_reader at 0x7f36a3ef8c10>\nmediawiki <function excel_reader at 0x7f36a3ef8c10>\nxlsx <function excel_reader at 0x7f36a3ef8c10>\nxls <function excel_reader at 0x7f36a3ef8c10>\nxlsm <function excel_reader at 0x7f36a3ef8c10>\ncsv <function text_reader at 0x7f36a3ef9000>\ntsv <function text_reader at 0x7f36a3ef9000>\ntxt <function text_reader at 0x7f36a3ef9000>\nods <function ods_reader at 0x7f36a3ef8ca0>\n (2) define your new file reader In\u00a0[71]: Copied! def my_magic_reader(path, **kwargs): # define your new file reader.\n print(\"do magic with {path}\")\n return\n def my_magic_reader(path, **kwargs): # define your new file reader. print(\"do magic with {path}\") return (3) add it to the list of readers. In\u00a0[72]: Copied! file_readers['my_special_format'] = my_magic_reader\n file_readers['my_special_format'] = my_magic_reader The file_readers are all in tablite.core so if you intend to extend the readers, I recommend that you start here. In\u00a0[73]: Copied! file = Path('example.xlsx')\ntx2.to_xlsx(file)\nos.remove(file)\n file = Path('example.xlsx') tx2.to_xlsx(file) os.remove(file) In\u00a0[74]: Copied! from tablite import Table\n\nt = Table({\n'a':[1, 2, 8, 3, 4, 6, 5, 7, 9],\n'b':[10, 100, 3, 4, 16, -1, 10, 10, 10],\n})\nt.sort(mapping={\"a\":False})\nt\n from tablite import Table t = Table({ 'a':[1, 2, 8, 3, 4, 6, 5, 7, 9], 'b':[10, 100, 3, 4, 16, -1, 10, 10, 10], }) t.sort(mapping={\"a\":False}) t creating sort index: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 1/1 [00:00<00:00, 1674.37it/s]\njoin: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 2/2 [00:00<00:00, 1701.89it/s]\n Out[74]: #ab 0110 12100 234 3416 4510 56-1 6710 783 8910 In\u00a0[75]: Copied! %pip install matplotlib -q\n %pip install matplotlib -q Note: you may need to restart the kernel to use updated packages.\n In\u00a0[76]: Copied! import matplotlib.pyplot as plt\nplt.plot(t['a'], t['b'])\nplt.ylabel('Hello Figure')\nplt.show()\n import matplotlib.pyplot as plt plt.plot(t['a'], t['b']) plt.ylabel('Hello Figure') plt.show() In\u00a0[77]: Copied! ## Let's monitor the memory and record the observations into a table!\nimport psutil, os, gc\nfrom time import process_time,sleep\nprocess = psutil.Process(os.getpid())\n\ndef mem_time(): # go and check taskmanagers memory usage.\n return process.memory_info().rss, process_time()\n\ndigits = 1_000_000\n\nrecords = Table({'method':[], 'memory':[], 'time':[]})\n ## Let's monitor the memory and record the observations into a table! import psutil, os, gc from time import process_time,sleep process = psutil.Process(os.getpid()) def mem_time(): # go and check taskmanagers memory usage. 
return process.memory_info().rss, process_time() digits = 1_000_000 records = Table({'method':[], 'memory':[], 'time':[]}) The row based format: 1 million 10-tuples In\u00a0[78]: Copied! before, start = mem_time()\nL = [tuple([11 for _ in range(10)]) for _ in range(digits)]\nafter, end = mem_time() \ndel L\ngc.collect()\n\nrecords.add_rows(*('1e6 lists w. 10 integers', after - before, round(end-start,4)))\nrecords\n before, start = mem_time() L = [tuple([11 for _ in range(10)]) for _ in range(digits)] after, end = mem_time() del L gc.collect() records.add_rows(*('1e6 lists w. 10 integers', after - before, round(end-start,4))) records Out[78]: #methodmemorytime 01e6 lists w. 10 integers1190543360.5045 The column based format: 10 columns with 1M values: In\u00a0[79]: Copied! before, start = mem_time()\nL = [[11 for i2 in range(digits)] for i1 in range(10)]\nafter,end = mem_time()\n\ndel L\ngc.collect()\nrecords.add_rows(('10 lists with 1e6 integers', after - before, round(end-start,4)))\n before, start = mem_time() L = [[11 for i2 in range(digits)] for i1 in range(10)] after,end = mem_time() del L gc.collect() records.add_rows(('10 lists with 1e6 integers', after - before, round(end-start,4))) We've thereby saved 50 Mb by avoiding the overhead from managing 1 million lists. Q: But why didn't I just use an array? It would have even lower memory footprint. A: First, array's don't handle None's and we get that frequently in dirty csv data. Second, Table needs even less memory. Let's try with an array: In\u00a0[80]: Copied! import array\n\nbefore, start = mem_time()\nL = [array.array('i', [11 for _ in range(digits)]) for _ in range(10)]\nafter,end = mem_time()\n\ndel L\ngc.collect()\nrecords.add_rows(('10 lists with 1e6 integers in arrays', after - before, round(end-start,4)))\nrecords\n import array before, start = mem_time() L = [array.array('i', [11 for _ in range(digits)]) for _ in range(10)] after,end = mem_time() del L gc.collect() records.add_rows(('10 lists with 1e6 integers in arrays', after - before, round(end-start,4))) records Out[80]: #methodmemorytime 01e6 lists w. 10 integers1190543360.5045 110 lists with 1e6 integers752762880.1906 210 lists with 1e6 integers in arrays398336000.3633 Finally let's use a tablite.Table : In\u00a0[81]: Copied! before,start = mem_time()\nt = Table(columns={str(i1): [11 for i2 in range(digits)] for i1 in range(10)})\nafter,end = mem_time()\n\nrecords.add_rows(('Table with 10 columns with 1e6 integers', after - before, round(end-start,4)))\n\nbefore,start = mem_time()\nt2 = t.copy()\nafter,end = mem_time()\n\nrecords.add_rows(('2 Tables with 10 columns with 1e6 integers each', after - before, round(end-start,4)))\n\n## Let's show it, so we know nobody's cheating:\nt2\n before,start = mem_time() t = Table(columns={str(i1): [11 for i2 in range(digits)] for i1 in range(10)}) after,end = mem_time() records.add_rows(('Table with 10 columns with 1e6 integers', after - before, round(end-start,4))) before,start = mem_time() t2 = t.copy() after,end = mem_time() records.add_rows(('2 Tables with 10 columns with 1e6 integers each', after - before, round(end-start,4))) ## Let's show it, so we know nobody's cheating: t2 Out[81]: #0123456789 011111111111111111111 111111111111111111111 211111111111111111111 311111111111111111111 411111111111111111111 511111111111111111111 611111111111111111111................................. 
999,99311111111111111111111 999,99411111111111111111111 999,99511111111111111111111 999,99611111111111111111111 999,99711111111111111111111 999,99811111111111111111111 999,99911111111111111111111 In\u00a0[82]: Copied! records\n records Out[82]: #methodmemorytime 01e6 lists w. 10 integers1190543360.5045 110 lists with 1e6 integers752762880.1906 210 lists with 1e6 integers in arrays398336000.3633 3Table with 10 columns with 1e6 integers01.9569 42 Tables with 10 columns with 1e6 integers each00.0001 Conclusion: whilst the common worst case (1M lists with 10 integers) take up 118 Mb of RAM, Tablite's tables vanish in the noise of memory measurement. Pandas also permits the usage of namedtuples, which are unpacked upon entry. from collections import namedtuple\nPoint = namedtuple(\"Point\", \"x y\")\npoints = [Point(0, 0), Point(0, 3)]\npd.DataFrame(points)\n Doing that in tablite is a bit different. To unpack the named tuple, you should do so explicitly: t = Table({'x': [p.x for p in points], 'y': [p.y for p in points]})\n However should you want to keep the points as namedtuple, you can do so in tablite: t = Table()\nt['points'] = points\n Tablite will store a serialised version of the points, so your memory overhead will be close to zero. "},{"location":"tutorial/#tablite","title":"Tablite\u00b6","text":""},{"location":"tutorial/#introduction","title":"Introduction\u00b6","text":"Tablite fills the data-science space where incremental data processing based on: - Datasets are larger than memory.
- You don't want to worry about datatypes.
Tablite thereby competes with: - Pandas, but saves you the memory overhead.
- Numpy, but spares you from worrying about lower-level datatypes.
- SQLite, by sheer speed.
- Polars, by working beyond RAM.
- Other libraries for data cleaning, thanks to tablite's powerful
datatypes module. Install: pip install tablite Usage: >>> from tablite import Table Upgrade: pip install tablite --no-cache --upgrade "},{"location":"tutorial/#overview","title":"Overview\u00b6","text":"(Version 2023.6.0 and later. For older version see this) - Tablite handles all Python datatypes:
str , float , bool , int , date , datetime , time , timedelta and None . - you can select:
- all rows in a column as
table['A'] - rows across all columns as
table[4:8] - or a slice as
table['A', 'B', slice(4,8) ] . - you can update with
table['A'][2] = new value - you can store or send data using json, by:
- dumping to json:
json_str = table.to_json() , or - you can load it with
Table.from_json(json_str) (see the round-trip sketch below). - you can iterate over rows using
for row in Table.rows . - you can ask
column_xyz in Table.columns ? - load from files with
new_table = Table.from_file('this.csv') which has automatic datatype detection - perform inner, outer & left SQL joins between tables as simple as
table_1.inner_join(table2, keys=['A', 'B']) - summarise using
table.groupby( ... ) - create pivot tables using
groupby.pivot( ... ) - perform multi-criteria lookup in tables using
table1.lookup(table2, criteria=..... - and of course a large selection of tools in
from tablite.tools import * "},{"location":"tutorial/#examples","title":"Examples\u00b6","text":"Here are some examples: "},{"location":"tutorial/#api-examples","title":"API Examples\u00b6","text":"In the following sections, example are given of the Tablite API's power features: - Iteration
- Append
- Sort
- Filter
- Index
- Search All
- Search Any
- Lookup
- Join inner, outer, left
- GroupBy
- Pivot table
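The json round-trip mentioned in the overview list above, as a minimal sketch (assuming to_json / from_json preserve the values unchanged):

from tablite import Table

t = Table({'A': [1, 2, 3]})
json_str = t.to_json()  ## dump
t2 = Table.from_json(json_str)  ## load
assert list(t2['A']) == [1, 2, 3]
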
"},{"location":"tutorial/#iteration","title":"ITERATION!\u00b6","text":"Iteration supports for loops and list comprehension at the speed of light: Just use [r for r in table.rows] , or: for row in table.rows:\n row ... Here's a more practical use case: (1) Imagine a table with columns a,b,c,d,e (all integers) like this: "},{"location":"tutorial/#create-index-indices","title":"Create Index / Indices\u00b6","text":"Index supports multi-key indexing using args such as: index = table.index('B','C') . Here's an example: "},{"location":"tutorial/#append","title":"APPEND\u00b6","text":""},{"location":"tutorial/#save","title":"SAVE\u00b6","text":""},{"location":"tutorial/#filter","title":"FILTER!\u00b6","text":""},{"location":"tutorial/#any-all","title":"Any! All?\u00b6","text":"Any and All are cousins of the filter. They're there so you can use them in the same way as you'd use any and all in python - as boolean evaluators: "},{"location":"tutorial/#sort","title":"SORT!\u00b6","text":""},{"location":"tutorial/#groupby","title":"GROUPBY !\u00b6","text":""},{"location":"tutorial/#did-i-say-pivot-table-yes","title":"Did I say pivot table? Yes.\u00b6","text":"Pivot Table is included in the groupby functionality - so yes - you can pivot the groupby on any column that is used for grouping. Here's a simple example: "},{"location":"tutorial/#join","title":"JOIN!\u00b6","text":""},{"location":"tutorial/#lookup","title":"LOOKUP!\u00b6","text":""},{"location":"tutorial/#match","title":"Match\u00b6","text":"If you're looking to do a join where you afterwards remove the empty rows, match is the faster choice. Here is an example. Let's start with two tables: "},{"location":"tutorial/#are-there-other-ways-i-can-add-data","title":"Are there other ways I can add data?\u00b6","text":"Yes - but row based operations cause a lot of IO, so it'll work but be slower: "},{"location":"tutorial/#okay-great-how-do-i-load-data","title":"Okay, great. How do I load data?\u00b6","text":"Easy. Use file_reader . Here's an example: "},{"location":"tutorial/#sweet-what-formats-are-supported-can-i-add-my-own-file-reader","title":"Sweet. What formats are supported? Can I add my own file reader?\u00b6","text":"Yes! This is very good for special log files or custom json formats. Here's how you do it: (1) Go to all existing readers in the tablite.core and find the closest match. "},{"location":"tutorial/#very-nice-how-about-exporting-data","title":"Very nice. How about exporting data?\u00b6","text":"Just use .export "},{"location":"tutorial/#cool-does-it-play-well-with-plotting-packages","title":"Cool. Does it play well with plotting packages?\u00b6","text":"Yes. Here's an example you can copy and paste: "},{"location":"tutorial/#i-like-sql-can-tablite-understand-sql","title":"I like sql. Can tablite understand SQL?\u00b6","text":"Almost. You can use table.to_sql and tablite will return ANSI-92 compliant SQL. You can also create a table using Table.from_sql and tablite will consume ANSI-92 compliant SQL. "},{"location":"tutorial/#but-what-do-i-do-if-im-about-to-run-out-of-memory","title":"But what do I do if I'm about to run out of memory?\u00b6","text":"You wont. Every tablite table is backed by disk. The memory footprint of a table is only the metadata required to know the relationships between variable names and the datastructures. Let's do a comparison: "},{"location":"tutorial/#conclusions","title":"Conclusions\u00b6","text":"This concludes the mega-tutorial to tablite . There's nothing more to it. 
But oh boy it'll save a lot of time. Here's a summary of features: - Everything a list can do.
- import csv*, fods, json, html, simple, rst, mediawiki, xlsx, xls, xlsm, tsv, txt, ods using
Table.from_file(...) - Iterate over rows or columns
- Create multikey
index , sort , use filter , any and all to select. Perform lookup across tables including using custom functions. - Perform multikey
joins with other tables. - Perform
groupby and reorganise data as a pivot table with max, min, sum, first, last, count, unique, average, standard deviation, median and mode. - Update tables with
+= which automatically sorts out the columns - even if they're not in perfect order. "},{"location":"tutorial/#faq","title":"FAQ\u00b6","text":"Question Answer I'm not in a notebook. Is there a nice way to view tables? Yes. table.show() prints the ascii version I'm looking for the equivalent to apply in pandas. Just use list comprehensions: table[column] = [f(x) for x in table[column] What about map ? Just use the python function: mapping = map(f, table[column name]) Is there a where function? It's called any or all like in python: table.any(column_name > 0) . I like sql and sqlite. Can I use sql? Yes. Call table.to_sql() returns ANSI-92 SQL compliant table definition.You can use this in any SQL compliant engine. | sometimes i need to clean up data with datetimes. Is there any tool to help with that? | Yes. Look at DataTypes.DataTypes.round(value, multiple) allows rounding of datetime. "},{"location":"tutorial/#coming-to-tablite-from-pandas","title":"Coming to Tablite from Pandas\u00b6","text":"If you're coming to Tablite from Pandas you will notice some differences. Here's the ultra short comparison to the documentation from Pandas called 10 minutes intro to pandas The tutorials provide the generic overview: - pandas tutorial
- tablite tutorial
Some key differences topic Tablite Viewing data Just use table.show() in print outs, or if you're in a jupyter notebook just use the variable name table Selection Slicing works both on columns and rows, and you can filter using any or all :table['A','B', 2:30:3].any(A=lambda x:x>3) to copy a table use: t2 = t.copy() This is a very fast deep copy, that has no memory overhead as tablites memory manager keeps track of the data. Missing data Tablite uses mixed column format for any format that isn't uniformTo get rid of rows with None s and np.nan s use any:table.drop_na(None, np.nan) Alternatively you can use replace: table.replace(None,5) following the syntax: table.replace_missing_values(sources, target) Operations Descriptive statistics are on a colum by column basis:table['a'].statistics() the pandas function df.apply doesn't exist in tablite. Use a list comprehension instead. For example: df.apply(np.cumsum) is just np.cumsum(t['A']) \"histogramming\" in tablite is per column: table['a'].histogram() string methods? Just use a list comprehensions: table['A', 'B'].any(A=lambda x: \"hello\" in x, B=lambda x: \"world\" in x) Merge Concatenation: Just use + or += as in t1 = t2 + t3 += t4 . If the columns are out of order, tablite will sort the headers according to the order in the first table.If you're worried that the header mismatch use t1.stack(t2) Joins are ANSI92 compliant: t1.join(t2, <...args...>, join_type=...) . Grouping Tablite supports multikey groupby using from tablite import Groupby as gb . table.groupby(keys, functions) Reshaping To reshape a table use transpose . to perform pivot table like operations, use: table.pivot(rows, columns, functions) subtotals aside tablite will give you everything Excels pivot table can do. Time series To convert time series use a list comprehension.t1['GMT'] = [timedelta(hours=1) + v for v in t1['date'] ] to generate a date range use:from Tablite import daterange t['date'] = date_range(start=2022/1/1, stop=2023/1/1, step=timedelta(days=1)) Categorical Pandas only seems to use this for sorting and grouping. Tablite table has .sort , .groupby and .pivot to achieve the same task. Plotting Import your favorite plotting package and feed it the values, such as:import matplotlib.pyplot as plt plt.plot(t['a'],t['b']) plt.showw() Import/Export Tablite supports the same import/export options as pandas.Tablite pegs the free memory before IO and can therefore process larger-than-RAM files. Tablite also guesses the datatypes for all ISOformats and uses multiprocessing and may therefore be faster. Should you want to inspect how guess works, use from tools import guess and try the function out. Gotchas None really. 
Should you come across something non-pythonic, then please post it on the issue list."},{"location":"reference/base/","title":"Base","text":""},{"location":"reference/base/#tablite.base","title":"tablite.base ","text":""},{"location":"reference/base/#tablite.base-attributes","title":"Attributes","text":""},{"location":"reference/base/#tablite.base.log","title":"tablite.base.log = logging.getLogger(__name__) module-attribute ","text":""},{"location":"reference/base/#tablite.base.file_registry","title":"tablite.base.file_registry = set() module-attribute ","text":""},{"location":"reference/base/#tablite.base-classes","title":"Classes","text":""},{"location":"reference/base/#tablite.base.SimplePage","title":"tablite.base.SimplePage(id, path, len, py_dtype) ","text":" Bases: object Source code in tablite/base.py def __init__(self, id, path, len, py_dtype) -> None:\n self.path = Path(path) / \"pages\" / f\"{id}.npy\"\n self.len = len\n self.dtype = py_dtype\n\n self._incr_refcount()\n "},{"location":"reference/base/#tablite.base.SimplePage-attributes","title":"Attributes","text":""},{"location":"reference/base/#tablite.base.SimplePage.ids","title":"tablite.base.SimplePage.ids = count(start=1) class-attribute instance-attribute ","text":""},{"location":"reference/base/#tablite.base.SimplePage.refcounts","title":"tablite.base.SimplePage.refcounts = {} class-attribute instance-attribute ","text":""},{"location":"reference/base/#tablite.base.SimplePage.autocleanup","title":"tablite.base.SimplePage.autocleanup = True class-attribute instance-attribute ","text":""},{"location":"reference/base/#tablite.base.SimplePage.path","title":"tablite.base.SimplePage.path = Path(path) / 'pages' / f'{id}.npy' instance-attribute ","text":""},{"location":"reference/base/#tablite.base.SimplePage.len","title":"tablite.base.SimplePage.len = len instance-attribute ","text":""},{"location":"reference/base/#tablite.base.SimplePage.dtype","title":"tablite.base.SimplePage.dtype = py_dtype instance-attribute ","text":""},{"location":"reference/base/#tablite.base.SimplePage-functions","title":"Functions","text":""},{"location":"reference/base/#tablite.base.SimplePage.__setstate__","title":"tablite.base.SimplePage.__setstate__(state) ","text":"when an object is unpickled, say in a case of multi-processing, object.setstate(state) is called instead of init, this means we need to update page refcount as if constructor had been called Source code in tablite/base.py def __setstate__(self, state):\n \"\"\"\n when an object is unpickled, say in a case of multi-processing,\n object.__setstate__(state) is called instead of __init__, this means\n we need to update page refcount as if constructor had been called\n \"\"\"\n self.__dict__.update(state)\n\n self._incr_refcount()\n "},{"location":"reference/base/#tablite.base.SimplePage.next_id","title":"tablite.base.SimplePage.next_id(path) classmethod ","text":"Source code in tablite/base.py @classmethod\ndef next_id(cls, path):\n path = Path(path)\n\n while True:\n _id = f\"{os.getpid()}-{next(cls.ids)}\"\n _path = path / \"pages\" / f\"{_id}.npy\"\n\n if not _path.exists():\n break # make sure we don't override existing pages if they are created outside of main thread\n\n return _id\n "},{"location":"reference/base/#tablite.base.SimplePage.__len__","title":"tablite.base.SimplePage.__len__() ","text":"Source code in tablite/base.py def __len__(self):\n return self.len\n "},{"location":"reference/base/#tablite.base.SimplePage.__repr__","title":"tablite.base.SimplePage.__repr__() -> str 
","text":"Source code in tablite/base.py def __repr__(self) -> str:\n try:\n return f\"{self.__class__.__name__}({self.path}, {self.get()})\"\n except FileNotFoundError as e:\n return f\"{self.__class__.__name__}({self.path}, <{type(e).__name__}>)\"\n except Exception as e:\n return f\"{self.__class__.__name__}({self.path}, <{e}>)\"\n "},{"location":"reference/base/#tablite.base.SimplePage.__hash__","title":"tablite.base.SimplePage.__hash__() -> int ","text":"Source code in tablite/base.py def __hash__(self) -> int:\n return hash(self.path)\n "},{"location":"reference/base/#tablite.base.SimplePage.owns","title":"tablite.base.SimplePage.owns() ","text":"Source code in tablite/base.py def owns(self):\n parts = self.path.parts\n\n return all((p in parts for p in Path(Config.pid).parts))\n "},{"location":"reference/base/#tablite.base.SimplePage.__del__","title":"tablite.base.SimplePage.__del__() ","text":"When python's reference count for an object is 0, python uses it's garbage collector to remove the object and free the memory. As tablite tables have columns and columns have page and pages have data stored on disk, the space on disk must be freed up as well. This del override assures the cleanup of stored data. Source code in tablite/base.py def __del__(self):\n \"\"\"When python's reference count for an object is 0, python uses\n it's garbage collector to remove the object and free the memory.\n As tablite tables have columns and columns have page and pages have\n data stored on disk, the space on disk must be freed up as well.\n This __del__ override assures the cleanup of stored data.\n \"\"\"\n if not self.owns():\n return\n\n refcount = self.refcounts[self.path] = max(\n self.refcounts.get(self.path, 0) - 1, 0\n )\n\n if refcount > 0:\n return\n\n if self.autocleanup:\n self.path.unlink(True)\n\n del self.refcounts[self.path]\n "},{"location":"reference/base/#tablite.base.SimplePage.get","title":"tablite.base.SimplePage.get() ","text":"loads stored data RETURNS DESCRIPTION np.ndarray: stored data. Source code in tablite/base.py def get(self):\n \"\"\"loads stored data\n\n Returns:\n np.ndarray: stored data.\n \"\"\"\n array = load_numpy(self.path)\n return MetaArray(array, array.dtype, py_dtype=self.dtype)\n "},{"location":"reference/base/#tablite.base.Page","title":"tablite.base.Page(path, array) ","text":" Bases: SimplePage PARAMETER DESCRIPTION path working directory. 
TYPE: Path array data TYPE: array Source code in tablite/base.py def __init__(self, path, array) -> None:\n \"\"\"\n Args:\n path (Path): working directory.\n array (np.array): data\n \"\"\"\n _id = self.next_id(path)\n\n type_check(array, np.ndarray)\n\n if Config.DISK_LIMIT <= 0:\n pass\n else:\n _, _, free = shutil.disk_usage(path)\n if free - array.nbytes < Config.DISK_LIMIT:\n msg = \"\\n\".join(\n [\n f\"Disk limit reached: Config.DISK_LIMIT = {Config.DISK_LIMIT:,} bytes.\",\n f\"array requires {array.nbytes:,} bytes, but only {free:,} bytes are free.\",\n \"To disable this check, use:\",\n \">>> from tablite.config import Config\",\n \">>> Config.DISK_LIMIT = 0\",\n \"To free space, clean up Config.workdir:\",\n f\"{Config.workdir}\",\n ]\n )\n raise OSError(msg)\n\n _len = len(array)\n # type_check(array, MetaArray)\n if not hasattr(array, \"metadata\"):\n raise ValueError\n _dtype = array.metadata[\"py_dtype\"]\n\n super().__init__(_id, path, _len, _dtype)\n\n np.save(self.path, array, allow_pickle=True, fix_imports=False)\n log.debug(f\"Page saved: {self.path}\")\n "},{"location":"reference/base/#tablite.base.Page-attributes","title":"Attributes","text":""},{"location":"reference/base/#tablite.base.Page.ids","title":"tablite.base.Page.ids = count(start=1) class-attribute instance-attribute ","text":""},{"location":"reference/base/#tablite.base.Page.refcounts","title":"tablite.base.Page.refcounts = {} class-attribute instance-attribute ","text":""},{"location":"reference/base/#tablite.base.Page.autocleanup","title":"tablite.base.Page.autocleanup = True class-attribute instance-attribute ","text":""},{"location":"reference/base/#tablite.base.Page.path","title":"tablite.base.Page.path = Path(path) / 'pages' / f'{id}.npy' instance-attribute ","text":""},{"location":"reference/base/#tablite.base.Page.len","title":"tablite.base.Page.len = len instance-attribute ","text":""},{"location":"reference/base/#tablite.base.Page.dtype","title":"tablite.base.Page.dtype = py_dtype instance-attribute ","text":""},{"location":"reference/base/#tablite.base.Page-functions","title":"Functions","text":""},{"location":"reference/base/#tablite.base.Page.__setstate__","title":"tablite.base.Page.__setstate__(state) ","text":"when an object is unpickled, say in a case of multi-processing, object.setstate(state) is called instead of init, this means we need to update page refcount as if constructor had been called Source code in tablite/base.py def __setstate__(self, state):\n \"\"\"\n when an object is unpickled, say in a case of multi-processing,\n object.__setstate__(state) is called instead of __init__, this means\n we need to update page refcount as if constructor had been called\n \"\"\"\n self.__dict__.update(state)\n\n self._incr_refcount()\n "},{"location":"reference/base/#tablite.base.Page.next_id","title":"tablite.base.Page.next_id(path) classmethod ","text":"Source code in tablite/base.py @classmethod\ndef next_id(cls, path):\n path = Path(path)\n\n while True:\n _id = f\"{os.getpid()}-{next(cls.ids)}\"\n _path = path / \"pages\" / f\"{_id}.npy\"\n\n if not _path.exists():\n break # make sure we don't override existing pages if they are created outside of main thread\n\n return _id\n "},{"location":"reference/base/#tablite.base.Page.__len__","title":"tablite.base.Page.__len__() ","text":"Source code in tablite/base.py def __len__(self):\n return self.len\n "},{"location":"reference/base/#tablite.base.Page.__repr__","title":"tablite.base.Page.__repr__() -> str ","text":"Source code in 
tablite/base.py def __repr__(self) -> str:\n try:\n return f\"{self.__class__.__name__}({self.path}, {self.get()})\"\n except FileNotFoundError as e:\n return f\"{self.__class__.__name__}({self.path}, <{type(e).__name__}>)\"\n except Exception as e:\n return f\"{self.__class__.__name__}({self.path}, <{e}>)\"\n "},{"location":"reference/base/#tablite.base.Page.__hash__","title":"tablite.base.Page.__hash__() -> int ","text":"Source code in tablite/base.py def __hash__(self) -> int:\n return hash(self.path)\n "},{"location":"reference/base/#tablite.base.Page.owns","title":"tablite.base.Page.owns() ","text":"Source code in tablite/base.py def owns(self):\n parts = self.path.parts\n\n return all((p in parts for p in Path(Config.pid).parts))\n "},{"location":"reference/base/#tablite.base.Page.__del__","title":"tablite.base.Page.__del__() ","text":"When python's reference count for an object is 0, python uses it's garbage collector to remove the object and free the memory. As tablite tables have columns and columns have page and pages have data stored on disk, the space on disk must be freed up as well. This del override assures the cleanup of stored data. Source code in tablite/base.py def __del__(self):\n \"\"\"When python's reference count for an object is 0, python uses\n it's garbage collector to remove the object and free the memory.\n As tablite tables have columns and columns have page and pages have\n data stored on disk, the space on disk must be freed up as well.\n This __del__ override assures the cleanup of stored data.\n \"\"\"\n if not self.owns():\n return\n\n refcount = self.refcounts[self.path] = max(\n self.refcounts.get(self.path, 0) - 1, 0\n )\n\n if refcount > 0:\n return\n\n if self.autocleanup:\n self.path.unlink(True)\n\n del self.refcounts[self.path]\n "},{"location":"reference/base/#tablite.base.Page.get","title":"tablite.base.Page.get() ","text":"loads stored data RETURNS DESCRIPTION np.ndarray: stored data. Source code in tablite/base.py def get(self):\n \"\"\"loads stored data\n\n Returns:\n np.ndarray: stored data.\n \"\"\"\n array = load_numpy(self.path)\n return MetaArray(array, array.dtype, py_dtype=self.dtype)\n "},{"location":"reference/base/#tablite.base.Column","title":"tablite.base.Column(path, value=None) ","text":" Bases: object Create Column PARAMETER DESCRIPTION path path of table.yml (defaults: Config.pid_dir) TYPE: Path value Data to store. Defaults to None. TYPE: Iterable DEFAULT: None Source code in tablite/base.py def __init__(self, path, value=None) -> None:\n \"\"\"Create Column\n\n Args:\n path (Path): path of table.yml (defaults: Config.pid_dir)\n value (Iterable, optional): Data to store. 
Defaults to None.\n \"\"\"\n self.path = path\n self.pages = [] # keeps pointers to instances of Page\n if value is not None:\n self.extend(value)\n "},{"location":"reference/base/#tablite.base.Column-attributes","title":"Attributes","text":""},{"location":"reference/base/#tablite.base.Column.path","title":"tablite.base.Column.path = path instance-attribute ","text":""},{"location":"reference/base/#tablite.base.Column.pages","title":"tablite.base.Column.pages = [] instance-attribute ","text":""},{"location":"reference/base/#tablite.base.Column-functions","title":"Functions","text":""},{"location":"reference/base/#tablite.base.Column.__len__","title":"tablite.base.Column.__len__() ","text":"Source code in tablite/base.py def __len__(self):\n return sum(len(p) for p in self.pages)\n "},{"location":"reference/base/#tablite.base.Column.__repr__","title":"tablite.base.Column.__repr__() ","text":"Source code in tablite/base.py def __repr__(self):\n return f\"{self.__class__.__name__}({self.path}, {self[:]})\"\n "},{"location":"reference/base/#tablite.base.Column.repaginate","title":"tablite.base.Column.repaginate() ","text":"resizes pages to Config.PAGE_SIZE Source code in tablite/base.py def repaginate(self):\n \"\"\"resizes pages to Config.PAGE_SIZE\"\"\"\n from tablite.nimlite import repaginate as _repaginate\n\n _repaginate(self)\n "},{"location":"reference/base/#tablite.base.Column.extend","title":"tablite.base.Column.extend(value) ","text":"extends the column. PARAMETER DESCRIPTION value data TYPE: ndarray Source code in tablite/base.py def extend(self, value): # USER FUNCTION.\n \"\"\"extends the column.\n\n Args:\n value (np.ndarray): data\n \"\"\"\n if isinstance(value, Column):\n self.pages.extend(value.pages[:])\n return\n elif isinstance(value, np.ndarray):\n pass\n elif isinstance(value, (list, tuple)):\n value = list_to_np_array(value)\n else:\n raise TypeError(f\"Cannot extend Column with {type(value)}\")\n type_check(value, np.ndarray)\n for array in self._paginate(value):\n self.pages.append(Page(path=self.path, array=array))\n "},{"location":"reference/base/#tablite.base.Column.clear","title":"tablite.base.Column.clear() ","text":"clears the column. Like list().clear() Source code in tablite/base.py def clear(self):\n \"\"\"\n clears the column. Like list().clear()\n \"\"\"\n self.pages.clear()\n "},{"location":"reference/base/#tablite.base.Column.getpages","title":"tablite.base.Column.getpages(item) ","text":"public non-user function to identify any pages + slices of data to be retrieved given a slice (item) PARAMETER DESCRIPTION item target slice of data TYPE: (int, slice) RETURNS DESCRIPTION list of pages/np.ndarrays. Example: [Page(1), Page(2), np.ndarray([4,5,6], int64)] This helps, for example when creating a copy, as the copy can reference the pages 1 and 2 and only need to store the np.ndarray that is unique to it. 
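A minimal sketch of the Column workflow described in the constructor and `extend` entries above, assuming the public `from tablite import Table` entry point (the table supplies the working directory beneath which pages are stored; `t["a"]` hands back the live Column, so extending it mutates the table):

```python
import numpy as np
from tablite import Table

t = Table(columns={"a": [1, 2, 3]})
col = t["a"]                      # the live tablite.base.Column

col.extend([4, 5])                # lists are converted to numpy arrays first
col.extend(np.array([6, 7]))      # arrays are paginated into Pages directly
assert len(col) == 7              # __len__ sums the lengths of all pages
```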
Source code in tablite/base.py def getpages(self, item):\n \"\"\"public non-user function to identify any pages + slices\n of data to be retrieved given a slice (item)\n\n Args:\n item (int,slice): target slice of data\n\n Returns:\n list of pages/np.ndarrays.\n\n Example: [Page(1), Page(2), np.ndarray([4,5,6], int64)]\n This helps, for example when creating a copy, as the copy\n can reference the pages 1 and 2 and only need to store\n the np.ndarray that is unique to it.\n \"\"\"\n # internal function\n if isinstance(item, int):\n if item < 0:\n item = len(self) + item\n item = slice(item, item + 1, 1)\n\n type_check(item, slice)\n is_reversed = False if (item.step is None or item.step > 0) else True\n\n length = len(self)\n scan_item = slice(*item.indices(length))\n range_item = range(*item.indices(length))\n\n pages = []\n start, end = 0, 0\n for page in self.pages:\n start, end = end, end + page.len\n if is_reversed:\n if start > scan_item.start:\n break\n if end < scan_item.stop:\n continue\n else:\n if start > scan_item.stop:\n break\n if end < scan_item.start:\n continue\n ro = intercept(range(start, end), range_item)\n if len(ro) == 0:\n continue\n elif len(ro) == page.len: # share the whole immutable page\n pages.append(page)\n else: # fetch the slice and filter it.\n search_slice = slice(ro.start - start, ro.stop - start, ro.step)\n np_arr = load_numpy(page.path)\n match = np_arr[search_slice]\n pages.append(match)\n\n if is_reversed:\n pages.reverse()\n for ix, page in enumerate(pages):\n if isinstance(page, SimplePage):\n data = page.get()\n pages[ix] = np.flip(data)\n else:\n pages[ix] = np.flip(page)\n\n return pages\n "},{"location":"reference/base/#tablite.base.Column.iter_by_page","title":"tablite.base.Column.iter_by_page() ","text":"iterates over the column, page by page. This method minimizes the number of reads. RETURNS DESCRIPTION generator of tuple: start: int end: int data: np.ndarray Source code in tablite/base.py def iter_by_page(self):\n \"\"\"iterates over the column, page by page.\n This method minimizes the number of reads.\n\n Returns:\n generator of tuple:\n start: int\n end: int\n data: np.ndarray\n \"\"\"\n start, end = 0, 0\n for page in self.pages:\n start, end = end, end + page.len\n yield start, end, page\n "},{"location":"reference/base/#tablite.base.Column.__getitem__","title":"tablite.base.Column.__getitem__(item) ","text":"gets numpy array. PARAMETER DESCRIPTION item slice of column TYPE: int OR slice RETURNS DESCRIPTION np.ndarray: results as numpy array. Remember: >>> R = np.array([0,1,2,3,4,5])\n>>> R[3]\n3\n>>> R[3:4]\narray([3])\n Source code in tablite/base.py def __getitem__(self, item): # USER FUNCTION.\n \"\"\"gets numpy array.\n\n Args:\n item (int OR slice): slice of column\n\n Returns:\n np.ndarray: results as numpy array.\n\n Remember:\n ```\n >>> R = np.array([0,1,2,3,4,5])\n >>> R[3]\n 3\n >>> R[3:4]\n array([3])\n ```\n \"\"\"\n result = []\n for element in self.getpages(item):\n if isinstance(element, SimplePage):\n result.append(element.get())\n else:\n result.append(element)\n\n if result:\n arr = np_type_unify(result)\n else:\n arr = np.array([])\n\n if isinstance(item, int):\n if len(arr) == 0:\n raise IndexError(\n f\"index {item} is out of bounds for axis 0 with size {len(self)}\"\n )\n return numpy_to_python(arr[0])\n else:\n return arr\n "},{"location":"reference/base/#tablite.base.Column.__setitem__","title":"tablite.base.Column.__setitem__(key, value) ","text":"sets values. 
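A short sketch of the read paths documented above, assuming a plain integer column; it relies only on the `__getitem__` semantics and the `(start, end, page)` tuples that `iter_by_page` yields:

```python
from tablite import Table

t = Table(columns={"a": list(range(10))})
col = t["a"]

col[3]        # -> 3, a single python value (like numpy scalar indexing)
col[3:5]      # -> np.ndarray([3, 4])

# iter_by_page streams one stored page at a time, minimizing reads,
# so aggregates can run without materializing the whole column:
total = 0
for start, end, page in col.iter_by_page():
    total += page.get().sum()     # page.get() loads the rows [start:end)
assert total == sum(range(10))
```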
PARAMETER DESCRIPTION key selector TYPE: (int, slice) value values to insert TYPE: any RAISES DESCRIPTION KeyError Following normal slicing rules Source code in tablite/base.py def __setitem__(self, key, value): # USER FUNCTION.\n \"\"\"sets values.\n\n Args:\n key (int,slice): selector\n value (any): values to insert\n\n Raises:\n KeyError: Following normal slicing rules\n \"\"\"\n if isinstance(key, int):\n self._setitem_integer_key(key, value)\n\n elif isinstance(key, slice):\n if not isinstance(value, np.ndarray):\n value = list_to_np_array(value)\n type_check(value, np.ndarray)\n\n if key.start is None and key.stop is None and key.step in (None, 1):\n self._setitem_replace_all(key, value)\n elif key.start is not None and key.stop is None and key.step in (None, 1):\n self._setitem_extend(key, value)\n elif key.stop is not None and key.start is None and key.step in (None, 1):\n self._setitem_prextend(key, value)\n elif (\n key.step in (None, 1) and key.start is not None and key.stop is not None\n ):\n self._setitem_insert(key, value)\n elif key.step not in (None, 1):\n self._setitem_update(key, value)\n else:\n raise KeyError(f\"bad key: {key}\")\n else:\n raise KeyError(f\"bad key: {key}\")\n "},{"location":"reference/base/#tablite.base.Column.__delitem__","title":"tablite.base.Column.__delitem__(key) ","text":"deletes items selected by key PARAMETER DESCRIPTION key selector TYPE: (int, slice) RAISES DESCRIPTION KeyError following normal slicing rules. Source code in tablite/base.py def __delitem__(self, key): # USER FUNCTION\n \"\"\"deletes items selected by key\n\n Args:\n key (int,slice): selector\n\n Raises:\n KeyError: following normal slicing rules.\n \"\"\"\n if isinstance(key, int):\n self._del_by_int(key)\n elif isinstance(key, slice):\n self._del_by_slice(key)\n else:\n raise KeyError(f\"bad key: {key}\")\n "},{"location":"reference/base/#tablite.base.Column.get_by_indices","title":"tablite.base.Column.get_by_indices(indices: Union[List[int], np.ndarray]) -> np.ndarray ","text":"retrieves values from column given a set of indices. PARAMETER DESCRIPTION indices targets TYPE: array This method uses np.take, is faster than iterating over rows. 
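A mutation sketch for the `__setitem__` and `__delitem__` entries above, assuming the documented "normal slicing rules"; the exact values in the final assert follow from list-like semantics:

```python
from tablite import Table

t = Table(columns={"a": [1, 2, 3, 4]})
col = t["a"]

col[0] = 42               # integer key: single-value update
col[1:3] = [10, 11]       # start+stop slice: replaces that span
col[len(col):] = [99]     # open-ended slice: extends the column
del col[-1]               # deletion follows normal slicing rules too
assert col == [42, 10, 11, 4]
```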
Examples: >>> indices = np.array(list(range(3,700_700, 426)))\n>>> arr = np.array(list(range(2_000_000)))\nPythonic:\n>>> [v for i,v in enumerate(arr) if i in indices]\nNumpyionic:\n>>> np.take(arr, indices)\n Source code in tablite/base.py def get_by_indices(self, indices: Union[List[int], np.ndarray]) -> np.ndarray:\n \"\"\"retrieves values from column given a set of indices.\n\n Args:\n indices (np.array): targets\n\n This method uses np.take, is faster than iterating over rows.\n Examples:\n ```\n >>> indices = np.array(list(range(3,700_700, 426)))\n >>> arr = np.array(list(range(2_000_000)))\n Pythonic:\n >>> [v for i,v in enumerate(arr) if i in indices]\n Numpyionic:\n >>> np.take(arr, indices)\n ```\n \"\"\"\n type_check(indices, np.ndarray)\n\n dtypes = set()\n values = np.empty(\n indices.shape, dtype=object\n ) # placeholder for the indexed values.\n\n for start, end, page in self.iter_by_page():\n range_match = np.asarray(((indices >= start) & (indices < end)) | (indices == -1)).nonzero()[0]\n if len(range_match):\n # only fetch the data if there's a range match!\n data = page.get() \n sub_index = np.take(indices, range_match)\n # sub_index2 otherwise will raise index error where len(data) > (-1 - start)\n # so the clause below is required:\n if len(data) > (-1 - start):\n sub_index = np.where(sub_index == -1, -1, sub_index - start)\n arr = np.take(data, sub_index)\n dtypes.add(arr.dtype)\n np.put(values, range_match, arr)\n\n if len(dtypes) == 1: # simplify the datatype\n dtype = next(iter(dtypes))\n values = np.array(values, dtype=dtype)\n return values\n "},{"location":"reference/base/#tablite.base.Column.__iter__","title":"tablite.base.Column.__iter__() ","text":"Source code in tablite/base.py def __iter__(self): # USER FUNCTION.\n for page in self.pages:\n data = page.get()\n for value in data:\n yield value\n "},{"location":"reference/base/#tablite.base.Column.__eq__","title":"tablite.base.Column.__eq__(other) ","text":"compares two columns. Like list1 == list2 Source code in tablite/base.py def __eq__(self, other): # USER FUNCTION.\n \"\"\"\n compares two columns. Like `list1 == list2`\n \"\"\"\n if len(self) != len(other): # quick cheap check.\n return False\n\n if isinstance(other, (list, tuple)):\n return all(a == b for a, b in zip(self[:], other))\n\n elif isinstance(other, Column):\n if self.pages == other.pages: # special case.\n return True\n\n # are the pages of same size?\n if len(self.pages) == len(other.pages):\n if [p.len for p in self.pages] == [p.len for p in other.pages]:\n for a, b in zip(self.pages, other.pages):\n if not (a.get() == b.get()).all():\n return False\n return True\n # to bad. Element comparison it is then:\n for a, b in zip(iter(self), iter(other)):\n if a != b:\n return False\n return True\n\n elif isinstance(other, np.ndarray):\n start, end = 0, 0\n for p in self.pages:\n start, end = end, end + p.len\n if not (p.get() == other[start:end]).all():\n return False\n return True\n else:\n raise TypeError(f\"Cannot compare {self.__class__} with {type(other)}\")\n "},{"location":"reference/base/#tablite.base.Column.__ne__","title":"tablite.base.Column.__ne__(other) ","text":"compares two columns. Like list1 != list2 Source code in tablite/base.py def __ne__(self, other): # USER FUNCTION\n \"\"\"\n compares two columns. 
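A comparison sketch built on the `__eq__` behavior documented above: cheap checks (length, shared pages) run before any element-wise work, and lists, Columns and numpy arrays are all accepted:

```python
from tablite import Table

t = Table(columns={"a": [1, 2, 3]})
a = t["a"]
b = a.copy()              # copy re-references the same immutable pages

assert a == b             # resolved by the shared-pages fast path
assert a == [1, 2, 3]     # lists compare element-wise
assert a == a[:]          # numpy arrays compare page by page
```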
Like `list1 != list2`\n \"\"\"\n if len(self) != len(other): # quick cheap check.\n return True\n\n if isinstance(other, (list, tuple)):\n return any(a != b for a, b in zip(self[:], other))\n\n elif isinstance(other, Column):\n if self.pages == other.pages: # special case.\n return False\n\n # are the pages of same size?\n if len(self.pages) == len(other.pages):\n if [p.len for p in self.pages] == [p.len for p in other.pages]:\n for a, b in zip(self.pages, other.pages):\n if not (a.get() == b.get()).all():\n return True\n return False\n # to bad. Element comparison it is then:\n for a, b in zip(iter(self), iter(other)):\n if a != b:\n return True\n return False\n\n elif isinstance(other, np.ndarray):\n start, end = 0, 0\n for p in self.pages:\n start, end = end, end + p.len\n if (p.get() != other[start:end]).any():\n return True\n return False\n else:\n raise TypeError(f\"Cannot compare {self.__class__} with {type(other)}\")\n "},{"location":"reference/base/#tablite.base.Column.copy","title":"tablite.base.Column.copy() ","text":"returns deep=copy of Column RETURNS DESCRIPTION Column Source code in tablite/base.py def copy(self):\n \"\"\"returns deep=copy of Column\n\n Returns:\n Column\n \"\"\"\n cp = Column(path=self.path)\n cp.pages = self.pages[:]\n return cp\n "},{"location":"reference/base/#tablite.base.Column.__copy__","title":"tablite.base.Column.__copy__() ","text":"see copy Source code in tablite/base.py def __copy__(self):\n \"\"\"see copy\"\"\"\n return self.copy()\n "},{"location":"reference/base/#tablite.base.Column.__imul__","title":"tablite.base.Column.__imul__(other) ","text":"Repeats instance of column N times. Like list() * N Example: >>> one = Column(data=[1,2])\n>>> one *= 5\n>>> one\n[1,2, 1,2, 1,2, 1,2, 1,2]\n Source code in tablite/base.py def __imul__(self, other):\n \"\"\"\n Repeats instance of column N times. Like list() * N\n\n Example:\n ```\n >>> one = Column(data=[1,2])\n >>> one *= 5\n >>> one\n [1,2, 1,2, 1,2, 1,2, 1,2]\n ```\n \"\"\"\n if not (isinstance(other, int) and other > 0):\n raise TypeError(\n f\"a column can be repeated an integer number of times, not {type(other)} number of times\"\n )\n self.pages = self.pages[:] * other\n return self\n "},{"location":"reference/base/#tablite.base.Column.__mul__","title":"tablite.base.Column.__mul__(other) ","text":"Repeats instance of column N times. Like list() * N Example: >>> one = Column(data=[1,2])\n>>> two = one * 5\n>>> two\n[1,2, 1,2, 1,2, 1,2, 1,2]\n Source code in tablite/base.py def __mul__(self, other):\n \"\"\"\n Repeats instance of column N times. Like list() * N\n\n Example:\n ```\n >>> one = Column(data=[1,2])\n >>> two = one * 5\n >>> two\n [1,2, 1,2, 1,2, 1,2, 1,2]\n ```\n \"\"\"\n if not isinstance(other, int):\n raise TypeError(\n f\"a column can be repeated an integer number of times, not {type(other)} number of times\"\n )\n cp = self.copy()\n cp *= other\n return cp\n "},{"location":"reference/base/#tablite.base.Column.__iadd__","title":"tablite.base.Column.__iadd__(other) ","text":"Source code in tablite/base.py def __iadd__(self, other):\n if isinstance(other, (list, tuple)):\n other = list_to_np_array(other)\n self.extend(other)\n elif isinstance(other, Column):\n self.pages.extend(other.pages[:])\n else:\n raise TypeError(f\"{type(other)} not supported.\")\n return self\n "},{"location":"reference/base/#tablite.base.Column.__contains__","title":"tablite.base.Column.__contains__(item) ","text":"determines if item is in the Column. 
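Because pages are immutable and reference-counted, `copy`, `*=` and `+=` only manipulate the list of page pointers; no stored data is rewritten. A sketch under that assumption:

```python
from tablite import Table

t = Table(columns={"a": [1, 2]})
col = t["a"]

cp = col.copy()           # cheap deep-copy: only page pointers are copied
cp *= 3                   # repetition repeats the page list
cp += col                 # concatenation appends the other column's pages
assert len(cp) == 4 * len(col) == 8
```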
Similar to 'x' in ['a','b','c'] returns boolean PARAMETER DESCRIPTION item value to search for TYPE: any RETURNS DESCRIPTION bool True if item exists in column. Source code in tablite/base.py def __contains__(self, item):\n \"\"\"determines if item is in the Column.\n Similar to `'x' in ['a','b','c']`\n returns boolean\n\n Args:\n item (any): value to search for\n\n Returns:\n bool: True if item exists in column.\n \"\"\"\n for page in set(self.pages):\n if item in page.get(): # x in np.ndarray([...]) uses np.any(arr, value)\n return True\n return False\n "},{"location":"reference/base/#tablite.base.Column.remove_all","title":"tablite.base.Column.remove_all(*values) ","text":"removes all values of values Source code in tablite/base.py def remove_all(self, *values):\n \"\"\"\n removes all values of `values`\n \"\"\"\n type_check(values, tuple)\n if isinstance(values[0], tuple):\n values = values[0]\n to_remove = list_to_np_array(values)\n for index, page in enumerate(self.pages):\n data = page.get()\n bitmask = np.isin(data, to_remove) # identify elements to remove.\n if bitmask.any():\n bitmask = np.invert(bitmask) # turn bitmask around to keep.\n new_data = np.compress(bitmask, data)\n new_page = Page(self.path, new_data)\n self.pages[index] = new_page\n "},{"location":"reference/base/#tablite.base.Column.replace","title":"tablite.base.Column.replace(mapping) ","text":"replaces values using a mapping. PARAMETER DESCRIPTION mapping {value to replace: new value, ...} TYPE: dict Example: >>> t = Table(columns={'A': [1,2,3,4]})\n>>> t['A'].replace({2:20,4:40})\n>>> t[:]\nnp.ndarray([1,20,3,40])\n Source code in tablite/base.py def replace(self, mapping):\n \"\"\"\n replaces values using a mapping.\n\n Args:\n mapping (dict): {value to replace: new value, ...}\n\n Example:\n ```\n >>> t = Table(columns={'A': [1,2,3,4]})\n >>> t['A'].replace({2:20,4:40})\n >>> t[:]\n np.ndarray([1,20,3,40])\n ```\n \"\"\"\n type_check(mapping, dict)\n to_replace = np.array(list(mapping.keys()))\n for index, page in enumerate(self.pages):\n data = page.get()\n bitmask = np.isin(data, to_replace) # identify elements to replace.\n if bitmask.any():\n warray = np.compress(bitmask, data)\n py_dtype = page.dtype\n for ix, v in enumerate(warray):\n old_py_val = numpy_to_python(v)\n new_py_val = mapping[old_py_val]\n old_dt = type(old_py_val)\n new_dt = type(new_py_val)\n\n warray[ix] = new_py_val\n\n py_dtype[new_dt] = py_dtype.get(new_dt, 0) + 1\n py_dtype[old_dt] = py_dtype.get(old_dt, 0) - 1\n\n if py_dtype[old_dt] <= 0:\n del py_dtype[old_dt]\n\n data[bitmask] = warray\n self.pages[index] = Page(path=self.path, array=data)\n "},{"location":"reference/base/#tablite.base.Column.types","title":"tablite.base.Column.types() ","text":"returns dict with python datatypes RETURNS DESCRIPTION dict frequency of occurrence of python datatypes Source code in tablite/base.py def types(self):\n \"\"\"\n returns dict with python datatypes\n\n Returns:\n dict: frequency of occurrence of python datatypes\n \"\"\"\n d = Counter()\n for page in self.pages:\n assert isinstance(page.dtype, dict)\n d += page.dtype\n return dict(d)\n "},{"location":"reference/base/#tablite.base.Column.index","title":"tablite.base.Column.index() ","text":"returns dict with { unique entry : list of indices } example: >>> c = Column(data=['a','b','a','c','b'])\n>>> c.index()\n{'a':[0,2], 'b': [1,4], 'c': [3]}\n Source code in tablite/base.py def index(self):\n \"\"\"\n returns dict with { unique entry : list of indices }\n\n example:\n ```\n >>> c = 
Column(data=['a','b','a','c','b'])\n >>> c.index()\n {'a':[0,2], 'b': [1,4], 'c': [3]}\n ```\n \"\"\"\n d = defaultdict(list)\n for ix, v in enumerate(self.__iter__()):\n d[v].append(ix)\n return dict(d)\n "},{"location":"reference/base/#tablite.base.Column.unique","title":"tablite.base.Column.unique() ","text":"returns unique list of values. example: >>> c = Column(data=['a','b','a','c','b'])\n>>> c.unqiue()\n['a','b','c']\n Source code in tablite/base.py def unique(self):\n \"\"\"\n returns unique list of values.\n\n example:\n ```\n >>> c = Column(data=['a','b','a','c','b'])\n >>> c.unqiue()\n ['a','b','c']\n ```\n \"\"\"\n arrays = []\n for page in set(self.pages):\n try: # when it works, numpy is fast...\n arrays.append(np.unique(page.get()))\n except TypeError: # ...but np.unique cannot handle Nones.\n arrays.append(multitype_set(page.get()))\n union = np_type_unify(arrays)\n try:\n return np.unique(union)\n except MemoryError:\n return np.array(set(union))\n except TypeError:\n return multitype_set(union)\n "},{"location":"reference/base/#tablite.base.Column.histogram","title":"tablite.base.Column.histogram() ","text":"returns 2 arrays: unique elements and count of each element example: >>> c = Column(data=['a','b','a','c','b'])\n>>> c.histogram()\n{'a':2,'b':2,'c':1}\n Source code in tablite/base.py def histogram(self):\n \"\"\"\n returns 2 arrays: unique elements and count of each element\n\n example:\n ```\n >>> c = Column(data=['a','b','a','c','b'])\n >>> c.histogram()\n {'a':2,'b':2,'c':1}\n ```\n \"\"\"\n d = defaultdict(int)\n for page in self.pages:\n try:\n uarray, carray = np.unique(page.get(), return_counts=True)\n except TypeError:\n uarray = page.get()\n carray = repeat(1, len(uarray))\n\n for i, c in zip(uarray, carray):\n v = numpy_to_python(i)\n d[(type(v), v)] += numpy_to_python(c)\n u = [v for _, v in d.keys()]\n c = list(d.values())\n return u, c # unique, counts\n "},{"location":"reference/base/#tablite.base.Column.statistics","title":"tablite.base.Column.statistics() ","text":"provides summary statistics. RETURNS DESCRIPTION dict returns dict with: - min (int/float, length of str, date)
- max (int/float, length of str, date)
- mean (int/float, length of str, date)
- median (int/float, length of str, date)
- stdev (int/float, length of str, date)
- mode (int/float, length of str, date)
- distinct (int/float, length of str, date)
- iqr (int/float, length of str, date)
- sum (int/float, length of str, date)
- histogram (see .histogram; usage sketch below)
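A usage sketch for `statistics`, assuming a small integer column; the key names follow the list above, while the exact values come from `summary_statistics`:

```python
from tablite import Table

t = Table(columns={"a": [1, 2, 2, 3]})
stats = t["a"].statistics()
print(stats["min"], stats["max"], stats["mean"], stats["mode"])
```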
Source code in tablite/base.py def statistics(self):\n \"\"\"provides summary statistics.\n\n Returns:\n dict: returns dict with:\n - min (int/float, length of str, date)\n - max (int/float, length of str, date)\n - mean (int/float, length of str, date)\n - median (int/float, length of str, date)\n - stdev (int/float, length of str, date)\n - mode (int/float, length of str, date)\n - distinct (int/float, length of str, date)\n - iqr (int/float, length of str, date)\n - sum (int/float, length of str, date)\n - histogram (see .histogram)\n \"\"\"\n values, counts = self.histogram()\n return summary_statistics(values, counts)\n "},{"location":"reference/base/#tablite.base.Column.count","title":"tablite.base.Column.count(item) ","text":"counts appearances of item in column. Note that in python, True == 1 and False == 0 , whereby the following difference occurs: in python: >>> L = [1, True]\n>>> L.count(True)\n2\n in tablite: >>> t = Table({'L': [1,True]})\n>>> t['L'].count(True)\n1\n PARAMETER DESCRIPTION item target item TYPE: Any RETURNS DESCRIPTION int number of occurrences of item. Source code in tablite/base.py def count(self, item):\n \"\"\"counts appearances of item in column.\n\n Note that in python, `True == 1` and `False == 0`,\n whereby the following difference occurs:\n\n in python:\n ```\n >>> L = [1, True]\n >>> L.count(True)\n 2\n ```\n in tablite:\n ```\n >>> t = Table({'L': [1,True]})\n >>> t['L'].count(True)\n 1\n ```\n\n Args:\n item (Any): target item\n\n Returns:\n int: number of occurrences of item.\n \"\"\"\n result = 0\n for page in self.pages:\n data = page.get()\n if data.dtype != \"O\":\n result += np.nonzero(page.get() == item)[0].shape[0]\n # what happens here ---^ below:\n # arr = page.get()\n # >>> arr\n # array([1,2,3,4,3], int64)\n # >>> (arr == 3)\n # array([False, False, True, False, True])\n # >>> np.nonzero(arr==3)\n # (array([2,4], dtype=int64), ) <-- tuple!\n # >>> np.nonzero(page.get() == item)[0]\n # array([2,4])\n # >>> np.nonzero(page.get() == item)[0].shape\n # (2, )\n # >>> np.nonzero(page.get() == item)[0].shape[0]\n # 2\n else:\n result += sum(1 for i in data if type(i) == type(item) and i == item)\n return result\n "},{"location":"reference/base/#tablite.base.BaseTable","title":"tablite.base.BaseTable(columns: [dict, None] = None, headers: [list, None] = None, rows: [list, None] = None, _path: [Path, None] = None) ","text":" Bases: object creates Table PARAMETER DESCRIPTION EITHER columns (dict, optional): dict with column names as keys, values as lists. Example: t = Table(columns={\"a\": [1, 2], \"b\": [3, 4]}) _path path to main process working directory. 
TYPE: Path DEFAULT: None Source code in tablite/base.py def __init__(\n self,\n columns: [dict, None] = None,\n headers: [list, None] = None,\n rows: [list, None] = None,\n _path: [Path, None] = None,\n) -> None:\n \"\"\"creates Table\n\n Args:\n EITHER:\n columns (dict, optional): dict with column names as keys, values as lists.\n Example: t = Table(columns={\"a\": [1, 2], \"b\": [3, 4]})\n OR\n headers (list of strings, optional): list of column names.\n rows (list of tuples or lists, optional): values for columns\n Example: t = Table(headers=[\"a\", \"b\"], rows=[[1,3], [2,4]])\n\n _path (pathlib.Path, optional): path to main process working directory.\n \"\"\"\n if _path is None:\n if self._pid_dir is None:\n self._pid_dir = Path(Config.workdir) / Config.pid\n if not self._pid_dir.exists():\n self._pid_dir.mkdir()\n (self._pid_dir / \"pages\").mkdir()\n register(self._pid_dir)\n\n _path = Path(self._pid_dir)\n # if path exists under the given PID it will be overwritten.\n # this can only happen if the process previously was SIGKILLed.\n type_check(_path, Path)\n self.path = _path # filename used during multiprocessing.\n self.columns = {} # maps colunn names to instances of Column.\n\n # user friendly features.\n if columns and any((headers, rows)):\n raise ValueError(\"Either columns as dict OR headers and rows. Not both.\")\n\n if headers and rows:\n rotated = list(zip(*rows))\n columns = {k: v for k, v in zip(headers, rotated)}\n\n if columns:\n type_check(columns, dict)\n for k, v in columns.items():\n self.__setitem__(k, v)\n "},{"location":"reference/base/#tablite.base.BaseTable-attributes","title":"Attributes","text":""},{"location":"reference/base/#tablite.base.BaseTable.path","title":"tablite.base.BaseTable.path = _path instance-attribute ","text":""},{"location":"reference/base/#tablite.base.BaseTable.columns","title":"tablite.base.BaseTable.columns = {} instance-attribute ","text":""},{"location":"reference/base/#tablite.base.BaseTable.rows","title":"tablite.base.BaseTable.rows property ","text":"enables row based iteration in python types. Example: for row in Table.rows:\n print(row)\n Yields: tuple: values is same order as columns. 
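A sketch of row-based iteration via the `rows` property described above; each yielded tuple carries the values in column order:

```python
from tablite import Table

t = Table(columns={"a": [1, 2], "b": [3, 4]})
for row in t.rows:        # yields one tuple per row, in column order
    print(row)            # (1, 3) then (2, 4)
```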
"},{"location":"reference/base/#tablite.base.BaseTable-functions","title":"Functions","text":""},{"location":"reference/base/#tablite.base.BaseTable.__str__","title":"tablite.base.BaseTable.__str__() ","text":"Source code in tablite/base.py def __str__(self): # USER FUNCTION.\n return f\"{self.__class__.__name__}({len(self.columns):,} columns, {len(self):,} rows)\"\n "},{"location":"reference/base/#tablite.base.BaseTable.__repr__","title":"tablite.base.BaseTable.__repr__() ","text":"Source code in tablite/base.py def __repr__(self):\n return self.__str__()\n "},{"location":"reference/base/#tablite.base.BaseTable.nbytes","title":"tablite.base.BaseTable.nbytes() ","text":"finds the total bytes of the table on disk RETURNS DESCRIPTION tuple int: real bytes used on disk int: total bytes used if flattened Source code in tablite/base.py def nbytes(self): # USER FUNCTION.\n \"\"\"finds the total bytes of the table on disk\n\n Returns:\n tuple:\n int: real bytes used on disk\n int: total bytes used if flattened\n \"\"\"\n real = {}\n total = 0\n for column in self.columns.values():\n for page in set(column.pages):\n real[page] = page.path.stat().st_size\n for page in column.pages:\n total += real[page]\n return sum(real.values()), total\n "},{"location":"reference/base/#tablite.base.BaseTable.items","title":"tablite.base.BaseTable.items() ","text":"returns table as dict RETURNS DESCRIPTION dict Table as dict {column_name: [values], ...} Source code in tablite/base.py def items(self): # USER FUNCTION.\n \"\"\"returns table as dict\n\n Returns:\n dict: Table as dict `{column_name: [values], ...}`\n \"\"\"\n return {\n name: column[:].tolist() for name, column in self.columns.items()\n }.items()\n "},{"location":"reference/base/#tablite.base.BaseTable.__delitem__","title":"tablite.base.BaseTable.__delitem__(key) ","text":"Examples: >>> del table['a'] # removes column 'a'\n>>> del table[-3:] # removes last 3 rows from all columns.\n Source code in tablite/base.py def __delitem__(self, key): # USER FUNCTION.\n \"\"\"\n Examples:\n ```\n >>> del table['a'] # removes column 'a'\n >>> del table[-3:] # removes last 3 rows from all columns.\n ```\n \"\"\"\n if isinstance(key, (int, slice)):\n for column in self.columns.values():\n del column[key]\n elif key in self.columns:\n del self.columns[key]\n else:\n raise KeyError(f\"Key not found: {key}\")\n "},{"location":"reference/base/#tablite.base.BaseTable.__setitem__","title":"tablite.base.BaseTable.__setitem__(key, value) ","text":"table behaves like a dict. Args: key (str or hashable): column name value (iterable): list, tuple or nd.array with values. As Table now accepts the keyword columns as a dict: >>> t = Table(columns={'b':[4,5,6], 'c':[7,8,9]})\n and the header/data combinations: >>> t = Table(header=['b','c'], data=[[4,5,6],[7,8,9]])\n This has the side-benefit that tuples now can be used as headers. 
Source code in tablite/base.py def __setitem__(self, key, value): # USER FUNCTION\n \"\"\"table behaves like a dict.\n Args:\n key (str or hashable): column name\n value (iterable): list, tuple or nd.array with values.\n\n As Table now accepts the keyword `columns` as a dict:\n ```\n >>> t = Table(columns={'b':[4,5,6], 'c':[7,8,9]})\n ```\n and the header/data combinations:\n ```\n >>> t = Table(header=['b','c'], data=[[4,5,6],[7,8,9]])\n ```\n This has the side-benefit that tuples now can be used as headers.\n \"\"\"\n if value is None:\n self.columns[key] = Column(self.path, value=None)\n elif isinstance(value, (list, tuple)):\n value = list_to_np_array(value)\n self.columns[key] = Column(self.path, value)\n elif isinstance(value, (np.ndarray)):\n self.columns[key] = Column(self.path, value)\n elif isinstance(value, Column):\n self.columns[key] = value\n else:\n raise TypeError(f\"{type(value)} not supported.\")\n "},{"location":"reference/base/#tablite.base.BaseTable.__getitem__","title":"tablite.base.BaseTable.__getitem__(keys) ","text":"Enables selection of columns and rows PARAMETER DESCRIPTION keys TYPE: column name, integer or slice Examples >>> 10] selects first 10 rows from all columns TYPE: table[ >>> 20:3] selects column 'b' and 'c' and 'a' twice for a slice. TYPE: table['b', 'a', 'a', 'c', 2 Raises: KeyError: if key is not found. TypeError: if key is not a string, integer or slice. RETURNS DESCRIPTION Table returns columns in same order as selection. Source code in tablite/base.py def __getitem__(self, keys): # USER FUNCTION\n \"\"\"\n Enables selection of columns and rows\n\n Args:\n keys (column name, integer or slice):\n Examples:\n ```\n >>> table['a'] selects column 'a'\n >>> table[3] selects row 3 as a tuple.\n >>> table[:10] selects first 10 rows from all columns\n >>> table['a','b', slice(3,20,2)] selects a slice from columns 'a' and 'b'\n >>> table['b', 'a', 'a', 'c', 2:20:3] selects column 'b' and 'c' and 'a' twice for a slice.\n >>> table[('b', 'a', 'a', 'c')] selects columns 'b', 'a', 'a', and 'c' using a tuple.\n ```\n Raises:\n KeyError: if key is not found.\n TypeError: if key is not a string, integer or slice.\n\n Returns:\n Table: returns columns in same order as selection.\n \"\"\"\n\n if not isinstance(keys, tuple):\n if isinstance(keys, list):\n keys = tuple(keys)\n else:\n keys = (keys,)\n if isinstance(keys[0], tuple):\n keys = tuple(list(chain(*keys)))\n\n integers = [i for i in keys if isinstance(i, int)]\n if len(integers) == len(keys) == 1: # return a single tuple.\n keys = [slice(keys[0])]\n\n column_names = [i for i in keys if isinstance(i, str)]\n column_names = list(self.columns) if not column_names else column_names\n not_found = [name for name in column_names if name not in self.columns]\n if not_found:\n raise KeyError(f\"keys not found: {', '.join(not_found)}\")\n\n slices = [i for i in keys if isinstance(i, slice)]\n slc = slice(0, len(self)) if not slices else slices[0]\n\n if (\n len(slices) == 0 and len(column_names) == 1\n ): # e.g. tbl['a'] or tbl['a'][:10]\n col = self.columns[column_names[0]]\n if slices:\n return col[slc] # return slice from column as list of values\n else:\n return col # return whole column\n\n elif len(integers) == 1: # return a single tuple.\n row_no = integers[0]\n slc = slice(row_no, row_no + 1)\n return tuple(self.columns[name][slc].tolist()[0] for name in column_names)\n\n elif not slices: # e.g. 
new table with N whole columns.\n return self.__class__(\n columns={name: self.columns[name] for name in column_names}\n )\n\n else: # e.g. new table from selection of columns and slices.\n t = self.__class__()\n for name in column_names:\n column = self.columns[name]\n\n new_column = Column(t.path) # create new Column.\n for item in column.getpages(slc):\n if isinstance(item, np.ndarray):\n new_column.extend(item) # extend subslice (expensive)\n elif isinstance(item, SimplePage):\n new_column.pages.append(item) # extend page (cheap)\n else:\n raise TypeError(f\"Bad item: {item}\")\n\n # below:\n # set the new column directly on t.columns.\n # Do not use t[name] as that triggers __setitem__ again.\n t.columns[name] = new_column\n\n return t\n "},{"location":"reference/base/#tablite.base.BaseTable.__len__","title":"tablite.base.BaseTable.__len__() ","text":"Source code in tablite/base.py def __len__(self): # USER FUNCTION.\n if not self.columns:\n return 0\n return max(len(c) for c in self.columns.values())\n "},{"location":"reference/base/#tablite.base.BaseTable.__eq__","title":"tablite.base.BaseTable.__eq__(other) -> bool ","text":"Determines if two tables have identical content. PARAMETER DESCRIPTION other table for comparison TYPE: Table RETURNS DESCRIPTION bool True if tables are identical. TYPE: bool Source code in tablite/base.py def __eq__(self, other) -> bool: # USER FUNCTION.\n \"\"\"Determines if two tables have identical content.\n\n Args:\n other (Table): table for comparison\n\n Returns:\n bool: True if tables are identical.\n \"\"\"\n if isinstance(other, dict):\n return self.items() == other.items()\n if not isinstance(other, BaseTable):\n return False\n if id(self) == id(other):\n return True\n if len(self) != len(other):\n return False\n if len(self) == len(other) == 0:\n return True\n if self.columns.keys() != other.columns.keys():\n return False\n for name, col in self.columns.items():\n if not (col == other.columns[name]):\n return False\n return True\n "},{"location":"reference/base/#tablite.base.BaseTable.clear","title":"tablite.base.BaseTable.clear() ","text":"clears the table. Like dict().clear() Source code in tablite/base.py def clear(self): # USER FUNCTION.\n \"\"\"clears the table. Like dict().clear()\"\"\"\n self.columns.clear()\n "},{"location":"reference/base/#tablite.base.BaseTable.save","title":"tablite.base.BaseTable.save(path, compression_method=zipfile.ZIP_DEFLATED, compression_level=1) ","text":"saves table to compressed tpz file. PARAMETER DESCRIPTION path file destination. TYPE: Path compression_method See zipfile compression methods. Defaults to ZIP_DEFLATED. DEFAULT: ZIP_DEFLATED compression_level See zipfile compression levels. Defaults to 1. DEFAULT: 1 The file format is as follows: .tpz is a gzip archive with table metadata captured as table.yml and the necessary set of pages saved as .npy files. 
The zip contains table.yml which provides an overview of the data: --------------------------------------\n%YAML 1.2 yaml version\ncolumns: start of columns section.\n name: \u201c\u5217 1\u201d name of column 1.\n pages: [p1b1, p1b2] list of pages in column 1.\n name: \u201c\u5217 2\u201d name of column 2\n pages: [p2b1, p2b2] list of pages in column 2.\n----------------------------------------\n Source code in tablite/base.py def save(\n self, path, compression_method=zipfile.ZIP_DEFLATED, compression_level=1\n): # USER FUNCTION.\n \"\"\"saves table to compressed tpz file.\n\n Args:\n path (Path): file destination.\n compression_method: See zipfile compression methods. Defaults to ZIP_DEFLATED.\n compression_level: See zipfile compression levels. Defaults to 1.\n The default settings produce 80% compression at 10% slowdown.\n\n The file format is as follows:\n .tpz is a gzip archive with table metadata captured as table.yml\n and the necessary set of pages saved as .npy files.\n\n The zip contains table.yml which provides an overview of the data:\n ```\n --------------------------------------\n %YAML 1.2 yaml version\n columns: start of columns section.\n name: \u201c\u5217 1\u201d name of column 1.\n pages: [p1b1, p1b2] list of pages in column 1.\n name: \u201c\u5217 2\u201d name of column 2\n pages: [p2b1, p2b2] list of pages in column 2.\n ----------------------------------------\n ```\n \"\"\"\n if isinstance(path, str):\n path = Path(path)\n type_check(path, Path)\n if path.is_dir():\n raise TypeError(f\"filename needed: {path}\")\n if path.suffix != \".tpz\":\n path = path.parent / (path.parts[-1] + \".tpz\")\n\n # create yaml document\n _page_counter = 0\n d = {}\n cols = {}\n for name, col in self.columns.items():\n type_check(col, Column)\n cols[name] = {\"pages\": [p.path.name for p in col.pages]}\n _page_counter += len(col.pages)\n d[\"columns\"] = cols\n yml = yaml.safe_dump(\n d, sort_keys=False, allow_unicode=True, default_flow_style=None\n )\n\n _file_counter = 0\n with zipfile.ZipFile(\n path, \"w\", compression=compression_method, compresslevel=compression_level\n ) as f:\n log.debug(f\"writing .tpz to {path} with\\n{yml}\")\n f.writestr(\"table.yml\", yml)\n for name, col in self.columns.items():\n for page in set(\n col.pages\n ): # set of pages! remember t *= 1000 repeats t 1000x\n with open(page.path, \"rb\", buffering=0) as raw_io:\n f.writestr(page.path.name, raw_io.read())\n _file_counter += 1\n log.debug(f\"adding Page {page.path}\")\n\n _fields = len(self) * len(self.columns)\n _avg = _fields // _page_counter\n log.debug(\n f\"Wrote {_fields:,} on {_page_counter:,} pages in {_file_counter} files: {_avg} fields/page\"\n )\n "},{"location":"reference/base/#tablite.base.BaseTable.load","title":"tablite.base.BaseTable.load(path, tqdm=_tqdm) classmethod ","text":"loads a table from .tpz file. See also Table.save for details on the file format. PARAMETER DESCRIPTION path source file TYPE: Path RETURNS DESCRIPTION Table table in read-only mode. 
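A round-trip sketch for `save` and `load`; the filename is hypothetical, and the `.tpz` suffix is enforced as described above:

```python
from tablite import Table

t = Table(columns={"a": [1, 2], "b": [3, 4]})
t.save("demo.tpz")            # writes table.yml plus the .npy pages to a zip
t2 = Table.load("demo.tpz")   # re-reads metadata and pages into a new table
assert t == t2                # content equality, column by column
```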
Source code in tablite/base.py @classmethod\ndef load(cls, path, tqdm=_tqdm): # USER FUNCTION.\n \"\"\"loads a table from .tpz file.\n See also Table.save for details on the file format.\n\n Args:\n path (Path): source file\n\n Returns:\n Table: table in read-only mode.\n \"\"\"\n path = Path(path)\n log.debug(f\"loading {path}\")\n with zipfile.ZipFile(path, \"r\") as f:\n yml = f.read(\"table.yml\")\n metadata = yaml.safe_load(yml)\n t = cls()\n\n page_count = sum([len(c[\"pages\"]) for c in metadata[\"columns\"].values()])\n\n with tqdm(\n total=page_count,\n desc=f\"loading '{path.name}' file\",\n disable=Config.TQDM_DISABLE,\n ) as pbar:\n for name, d in metadata[\"columns\"].items():\n column = Column(t.path)\n for page in d[\"pages\"]:\n bytestream = io.BytesIO(f.read(page))\n data = np.load(bytestream, allow_pickle=True, fix_imports=False)\n column.extend(data)\n pbar.update(1)\n t.columns[name] = column\n update_access_time(path)\n return t\n "},{"location":"reference/base/#tablite.base.BaseTable.copy","title":"tablite.base.BaseTable.copy() ","text":"Source code in tablite/base.py def copy(self):\n cls = type(self)\n t = cls()\n for name, column in self.columns.items():\n new = Column(t.path)\n new.pages = column.pages[:]\n t.columns[name] = new\n return t\n "},{"location":"reference/base/#tablite.base.BaseTable.__imul__","title":"tablite.base.BaseTable.__imul__(other) ","text":"Repeats instance of table N times. Like list: t = t * N PARAMETER DESCRIPTION other multiplier TYPE: int Source code in tablite/base.py def __imul__(self, other):\n \"\"\"Repeats instance of table N times.\n\n Like list: `t = t * N`\n\n Args:\n other (int): multiplier\n \"\"\"\n if not (isinstance(other, int) and other > 0):\n raise TypeError(\n f\"a table can be repeated an integer number of times, not {type(other)} number of times\"\n )\n for col in self.columns.values():\n col *= other\n return self\n "},{"location":"reference/base/#tablite.base.BaseTable.__mul__","title":"tablite.base.BaseTable.__mul__(other) ","text":"Repeat table N times. Like list: new = old * N PARAMETER DESCRIPTION other multiplier TYPE: int RETURNS DESCRIPTION Table Source code in tablite/base.py def __mul__(self, other):\n \"\"\"Repeat table N times.\n Like list: `new = old * N`\n\n Args:\n other (int): multiplier\n\n Returns:\n Table\n \"\"\"\n new = self.copy()\n return new.__imul__(other)\n "},{"location":"reference/base/#tablite.base.BaseTable.__iadd__","title":"tablite.base.BaseTable.__iadd__(other) ","text":"Concatenates tables with same column names. Like list: table_1 += table_2 RAISES DESCRIPTION ValueError If column names don't match. RETURNS DESCRIPTION None self is updated. Source code in tablite/base.py def __iadd__(self, other):\n \"\"\"Concatenates tables with same column names.\n\n Like list: `table_1 += table_2`\n\n Args:\n other (Table)\n\n Raises:\n ValueError: If column names don't match.\n\n Returns:\n None: self is updated.\n \"\"\"\n type_check(other, BaseTable)\n for name in self.columns.keys():\n if name not in other.columns:\n raise ValueError(f\"{name} not in other\")\n for name in other.columns.keys():\n if name not in self.columns:\n raise ValueError(f\"{name} missing from self\")\n\n for name, column in self.columns.items():\n other_col = other.columns.get(name, None)\n column.pages.extend(other_col.pages[:])\n return self\n "},{"location":"reference/base/#tablite.base.BaseTable.__add__","title":"tablite.base.BaseTable.__add__(other) ","text":"Concatenates tables with same column names. 
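A concatenation sketch covering `__add__` and `__iadd__`; both require identical column names and, like the other operators above, only append page references:

```python
from tablite import Table

t1 = Table(columns={"a": [1], "b": [2]})
t2 = Table(columns={"a": [3], "b": [4]})

t3 = t1 + t2              # new table; raises ValueError on column mismatch
t1 += t2                  # in-place: appends t2's pages to t1's columns
assert len(t3) == len(t1) == 2
```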
Like list: table_3 = table_1 + table_2 RAISES DESCRIPTION ValueError If column names don't match. RETURNS DESCRIPTION Table Source code in tablite/base.py def __add__(self, other):\n \"\"\"Concatenates tables with same column names.\n\n Like list: `table_3 = table_1 + table_2`\n\n Args:\n other (Table)\n\n Raises:\n ValueError: If column names don't match.\n\n Returns:\n Table\n \"\"\"\n type_check(other, BaseTable)\n cp = self.copy()\n cp += other\n return cp\n "},{"location":"reference/base/#tablite.base.BaseTable.add_rows","title":"tablite.base.BaseTable.add_rows(*args, **kwargs) ","text":"its more efficient to add many rows at once. if both args and kwargs, then args are added first, followed by kwargs. supported cases: >>> t = Table()\n>>> t.add_columns('row','A','B','C')\n>>> t.add_rows(1, 1, 2, 3) # (1) individual values as args\n>>> t.add_rows([2, 1, 2, 3]) # (2) list of values as args\n>>> t.add_rows((3, 1, 2, 3)) # (3) tuple of values as args\n>>> t.add_rows(*(4, 1, 2, 3)) # (4) unpacked tuple becomes arg like (1)\n>>> t.add_rows(row=5, A=1, B=2, C=3) # (5) kwargs\n>>> t.add_rows(**{'row': 6, 'A': 1, 'B': 2, 'C': 3}) # (6) dict / json interpreted a kwargs\n>>> t.add_rows((7, 1, 2, 3), (8, 4, 5, 6)) # (7) two (or more) tuples as args\n>>> t.add_rows([9, 1, 2, 3], [10, 4, 5, 6]) # (8) two or more lists as rgs\n>>> t.add_rows(\n {'row': 11, 'A': 1, 'B': 2, 'C': 3},\n {'row': 12, 'A': 4, 'B': 5, 'C': 6}\n ) # (9) two (or more) dicts as args - roughly comma sep'd json.\n>>> t.add_rows( *[\n {'row': 13, 'A': 1, 'B': 2, 'C': 3},\n {'row': 14, 'A': 1, 'B': 2, 'C': 3}\n ]) # (10) list of dicts as args\n>>> t.add_rows(row=[15,16], A=[1,1], B=[2,2], C=[3,3]) # (11) kwargs with lists as values\n Source code in tablite/base.py def add_rows(self, *args, **kwargs):\n \"\"\"its more efficient to add many rows at once.\n\n if both args and kwargs, then args are added first, followed by kwargs.\n\n supported cases:\n ```\n >>> t = Table()\n >>> t.add_columns('row','A','B','C')\n >>> t.add_rows(1, 1, 2, 3) # (1) individual values as args\n >>> t.add_rows([2, 1, 2, 3]) # (2) list of values as args\n >>> t.add_rows((3, 1, 2, 3)) # (3) tuple of values as args\n >>> t.add_rows(*(4, 1, 2, 3)) # (4) unpacked tuple becomes arg like (1)\n >>> t.add_rows(row=5, A=1, B=2, C=3) # (5) kwargs\n >>> t.add_rows(**{'row': 6, 'A': 1, 'B': 2, 'C': 3}) # (6) dict / json interpreted a kwargs\n >>> t.add_rows((7, 1, 2, 3), (8, 4, 5, 6)) # (7) two (or more) tuples as args\n >>> t.add_rows([9, 1, 2, 3], [10, 4, 5, 6]) # (8) two or more lists as rgs\n >>> t.add_rows(\n {'row': 11, 'A': 1, 'B': 2, 'C': 3},\n {'row': 12, 'A': 4, 'B': 5, 'C': 6}\n ) # (9) two (or more) dicts as args - roughly comma sep'd json.\n >>> t.add_rows( *[\n {'row': 13, 'A': 1, 'B': 2, 'C': 3},\n {'row': 14, 'A': 1, 'B': 2, 'C': 3}\n ]) # (10) list of dicts as args\n >>> t.add_rows(row=[15,16], A=[1,1], B=[2,2], C=[3,3]) # (11) kwargs with lists as values\n ```\n\n \"\"\"\n if not BaseTable._add_row_slow_warning:\n warnings.warn(\n \"add_rows is slow. Consider using add_columns and then assigning values to the columns directly.\"\n )\n BaseTable._add_row_slow_warning = True\n\n if args:\n if not all(isinstance(i, (list, tuple, dict)) for i in args): # 1,4\n args = [args]\n\n if all(isinstance(i, (list, tuple, dict)) for i in args): # 2,3,7,8\n # 1. 
turn the data into columns:\n\n d = {n: [] for n in self.columns}\n for arg in args:\n if len(arg) != len(self.columns):\n raise ValueError(\n f\"len({arg})== {len(arg)}, but there are {len(self.columns)} columns\"\n )\n\n if isinstance(arg, dict):\n for k, v in arg.items(): # 7,8\n d[k].append(v)\n\n elif isinstance(arg, (list, tuple)): # 2,3\n for n, v in zip(self.columns, arg):\n d[n].append(v)\n\n else:\n raise TypeError(f\"{arg}?\")\n # 2. extend the columns\n for n, values in d.items():\n col = self.columns[n]\n col.extend(list_to_np_array(values))\n\n if kwargs:\n if isinstance(kwargs, dict):\n if all(isinstance(v, (list, tuple)) for v in kwargs.values()):\n for k, v in kwargs.items():\n col = self.columns[k]\n col.extend(list_to_np_array(v))\n else:\n for k, v in kwargs.items():\n col = self.columns[k]\n col.extend(np.array([v]))\n else:\n raise ValueError(f\"format not recognised: {kwargs}\")\n\n return\n "},{"location":"reference/base/#tablite.base.BaseTable.add_columns","title":"tablite.base.BaseTable.add_columns(*names) ","text":"Adds column names to table. Source code in tablite/base.py def add_columns(self, *names):\n \"\"\"Adds column names to table.\"\"\"\n for name in names:\n self.columns[name] = Column(self.path)\n "},{"location":"reference/base/#tablite.base.BaseTable.add_column","title":"tablite.base.BaseTable.add_column(name, data=None) ","text":"verbose alias for table[name] = data, that checks if name already exists PARAMETER DESCRIPTION name column name TYPE: str data values. Defaults to None. TYPE: list,tuple) DEFAULT: None RAISES DESCRIPTION TypeError name isn't string ValueError name already exists Source code in tablite/base.py def add_column(self, name, data=None):\n \"\"\"verbose alias for table[name] = data, that checks if name already exists\n\n Args:\n name (str): column name\n data ((list,tuple), optional): values. Defaults to None.\n\n Raises:\n TypeError: name isn't string\n ValueError: name already exists\n \"\"\"\n if not isinstance(name, str):\n raise TypeError(\"expected name as string\")\n if name in self.columns:\n raise ValueError(f\"{name} already in {self.columns}\")\n self.__setitem__(name, data)\n "},{"location":"reference/base/#tablite.base.BaseTable.stack","title":"tablite.base.BaseTable.stack(other) ","text":"returns the joint stack of tables with overlapping column names. Example: | Table A| + | Table B| = | Table AB |\n| A| B| C| | A| B| D| | A| B| C| -|\n | A| B| -| D|\n Source code in tablite/base.py def stack(self, other):\n \"\"\"\n returns the joint stack of tables with overlapping column names.\n Example:\n ```\n | Table A| + | Table B| = | Table AB |\n | A| B| C| | A| B| D| | A| B| C| -|\n | A| B| -| D|\n ```\n \"\"\"\n if not isinstance(other, BaseTable):\n raise TypeError(f\"stack only works for Table, not {type(other)}\")\n\n cp = self.copy()\n for name, col2 in other.columns.items():\n if name not in cp.columns:\n cp[name] = [None] * len(self)\n cp[name].pages.extend(col2.pages[:])\n\n for name in self.columns:\n if name not in other.columns:\n if len(cp) > 0:\n cp[name].extend(np.array([None] * len(other)))\n return cp\n "},{"location":"reference/base/#tablite.base.BaseTable.types","title":"tablite.base.BaseTable.types() ","text":"returns nested dict of data types in the form: {column name: {python type class: number of instances }, ... 
} example: >>> t.types()\n{\n 'A': {<class 'str'>: 7},\n 'B': {<class 'int'>: 7}\n}\n Source code in tablite/base.py def types(self):\n \"\"\"\n returns nested dict of data types in the form:\n `{column name: {python type class: number of instances }, ... }`\n\n example:\n ```\n >>> t.types()\n {\n 'A': {<class 'str'>: 7},\n 'B': {<class 'int'>: 7}\n }\n ```\n \"\"\"\n d = {}\n for name, col in self.columns.items():\n assert isinstance(col, Column)\n d[name] = col.types()\n return d\n "},{"location":"reference/base/#tablite.base.BaseTable.display_dict","title":"tablite.base.BaseTable.display_dict(slice_=None, blanks=None, dtype=False) ","text":"helper for creating dict for display. PARAMETER DESCRIPTION slice_ python slice. Defaults to None. TYPE: slice DEFAULT: None blanks fill value for None . Defaults to None. TYPE: optional DEFAULT: None dtype Adds datatype to each column. Defaults to False. TYPE: bool DEFAULT: False RAISES DESCRIPTION TypeError slice_ must be None or slice. RETURNS DESCRIPTION dict from Table. Source code in tablite/base.py def display_dict(self, slice_=None, blanks=None, dtype=False):\n \"\"\"helper for creating dict for display.\n\n Args:\n slice_ (slice, optional): python slice. Defaults to None.\n blanks (optional): fill value for `None`. Defaults to None.\n dtype (bool, optional): Adds datatype to each column. Defaults to False.\n\n Raises:\n TypeError: slice_ must be None or slice.\n\n Returns:\n dict: from Table.\n \"\"\"\n if not self.columns:\n print(\"Empty Table\")\n return\n\n def datatype(col): # PRIVATE\n \"\"\"creates label for column datatype.\"\"\"\n types = col.types()\n if len(types) == 0:\n typ = \"empty\"\n elif len(types) == 1:\n dt, _ = types.popitem()\n typ = dt.__name__\n else:\n typ = \"mixed\"\n return typ\n\n row_count_tags = [\"#\", \"~\", \"*\"]\n cols = set(self.columns)\n for n, tag in product(range(1, 6), row_count_tags):\n if n * tag not in cols:\n tag = n * tag\n break\n\n if not isinstance(slice_, (slice, type(None))):\n raise TypeError(f\"slice_ must be None or slice, not {type(slice_)}\")\n if isinstance(slice_, slice):\n slc = slice_\n if slice_ is None:\n if len(self) <= 20:\n slc = slice(0, 20, 1)\n else:\n slc = None\n\n n = len(self)\n if slc: # either we want slc or we want everything.\n row_no = list(range(*slc.indices(len(self))))\n data = {tag: [f\"{i:,}\".rjust(2) for i in row_no]}\n for name, col in self.columns.items():\n data[name] = list(chain(iter(col), repeat(blanks, times=n - len(col))))[\n slc\n ]\n else:\n data = {}\n j = int(math.ceil(math.log10(n)) / 3) + len(str(n))\n row_no = (\n [f\"{i:,}\".rjust(j) for i in range(7)]\n + [\"...\"]\n + [f\"{i:,}\".rjust(j) for i in range(n - 7, n)]\n )\n data = {tag: row_no}\n\n for name, col in self.columns.items():\n if len(col) == n:\n row = col[:7].tolist() + [\"...\"] + col[-7:].tolist()\n else:\n empty = [blanks] * 7\n head = (col[:7].tolist() + empty)[:7]\n tail = (col[n - 7 :].tolist() + empty)[-7:]\n row = head + [\"...\"] + tail\n data[name] = row\n\n if dtype:\n for name, values in data.items():\n if name in self.columns:\n col = self.columns[name]\n values.insert(0, datatype(col))\n else:\n values.insert(0, \"row\")\n\n return data\n "},{"location":"reference/base/#tablite.base.BaseTable.to_ascii","title":"tablite.base.BaseTable.to_ascii(slice_=None, blanks=None, dtype=False) ","text":"returns ascii view of table as string. PARAMETER DESCRIPTION slice_ slice to determine table snippet. TYPE: slice DEFAULT: None blanks value for whitespace. Defaults to None. 
TYPE: str DEFAULT: None dtype adds subheader with datatype for column. Defaults to False. TYPE: bool DEFAULT: False Source code in tablite/base.py def to_ascii(self, slice_=None, blanks=None, dtype=False):\n \"\"\"returns ascii view of table as string.\n\n Args:\n slice_ (slice, optional): slice to determine table snippet.\n blanks (str, optional): value for whitespace. Defaults to None.\n dtype (bool, optional): adds subheader with datatype for column. Defaults to False.\n \"\"\"\n\n def adjust(v, length): # PRIVATE FUNCTION\n \"\"\"whitespace justifies field values based on datatype\"\"\"\n if v is None:\n return str(blanks).ljust(length)\n elif isinstance(v, str):\n return v.ljust(length)\n else:\n return str(v).rjust(length)\n\n if not self.columns:\n return str(self)\n\n d = {}\n for name, values in self.display_dict(\n slice_=slice_, blanks=blanks, dtype=dtype\n ).items():\n as_text = [str(v) for v in values] + [str(name)]\n width = max(len(i) for i in as_text)\n new_name = name.center(width, \" \")\n if dtype:\n values[0] = values[0].center(width, \" \")\n d[new_name] = [adjust(v, width) for v in values]\n\n rows = dict_to_rows(d)\n s = []\n s.append(\"+\" + \"+\".join([\"=\" * len(n) for n in rows[0]]) + \"+\")\n s.append(\"|\" + \"|\".join(rows[0]) + \"|\") # column names\n start = 1\n if dtype:\n s.append(\"|\" + \"|\".join(rows[1]) + \"|\") # datatypes\n start = 2\n\n s.append(\"+\" + \"+\".join([\"-\" * len(n) for n in rows[0]]) + \"+\")\n for row in rows[start:]:\n s.append(\"|\" + \"|\".join(row) + \"|\")\n s.append(\"+\" + \"+\".join([\"=\" * len(n) for n in rows[0]]) + \"+\")\n\n if len(set(len(c) for c in self.columns.values())) != 1:\n warning = f\"Warning: Columns have different lengths. {blanks} is used as fill value.\"\n s.append(warning)\n\n return \"\\n\".join(s)\n "},{"location":"reference/base/#tablite.base.BaseTable.show","title":"tablite.base.BaseTable.show(slice_=None, blanks=None, dtype=False) ","text":"prints ascii view of table. PARAMETER DESCRIPTION slice_ slice to determine table snippet. TYPE: slice DEFAULT: None blanks value for whitespace. Defaults to None. TYPE: str DEFAULT: None dtype adds subheader with datatype for column. Defaults to False. TYPE: bool DEFAULT: False Source code in tablite/base.py def show(self, slice_=None, blanks=None, dtype=False):\n \"\"\"prints ascii view of table.\n\n Args:\n slice_ (slice, optional): slice to determine table snippet.\n blanks (str, optional): value for whitespace. Defaults to None.\n dtype (bool, optional): adds subheader with datatype for column. Defaults to False.\n \"\"\"\n print(self.to_ascii(slice_=slice_, blanks=blanks, dtype=dtype))\n "},{"location":"reference/base/#tablite.base.BaseTable.to_dict","title":"tablite.base.BaseTable.to_dict(columns=None, slice_=None) ","text":"columns: list of column names. Default is None == all columns. slice_: slice. Default is None == all rows. returns: dict with columns as keys and lists of values. Example: >>> t.show()\n+===+===+===+\n| # | a | b |\n|row|int|int|\n+---+---+---+\n| 0 | 1| 3|\n| 1 | 2| 4|\n+===+===+===+\n>>> t.to_dict()\n{'a':[1,2], 'b':[3,4]}\n Source code in tablite/base.py def to_dict(self, columns=None, slice_=None):\n \"\"\"\n columns: list of column names. Default is None == all columns.\n slice_: slice. 
Default is None == all rows.\n\n returns: dict with columns as keys and lists of values.\n\n Example:\n ```\n >>> t.show()\n +===+===+===+\n | # | a | b |\n |row|int|int|\n +---+---+---+\n | 0 | 1| 3|\n | 1 | 2| 4|\n +===+===+===+\n >>> t.to_dict()\n {'a':[1,2], 'b':[3,4]}\n ```\n\n \"\"\"\n if slice_ is None:\n slice_ = slice(0, len(self))\n assert isinstance(slice_, slice)\n\n if columns is None:\n columns = list(self.columns.keys())\n if not isinstance(columns, list):\n raise TypeError(\"expected columns as list of strings\")\n\n return {name: list(self.columns[name][slice_]) for name in columns}\n "},{"location":"reference/base/#tablite.base.BaseTable.as_json_serializable","title":"tablite.base.BaseTable.as_json_serializable(row_count='row id', start_on=1, columns=None, slice_=None) ","text":"provides a JSON compatible format of the table. PARAMETER DESCRIPTION row_count Label for row counts. Defaults to \"row id\". TYPE: str DEFAULT: 'row id' start_on row counts starts by default on 1. TYPE: int DEFAULT: 1 columns Column names. Defaults to None which returns all columns. TYPE: list of str DEFAULT: None slice_ selector. Defaults to None which returns [:] TYPE: slice DEFAULT: None RETURNS DESCRIPTION JSON serializable dict: All python datatypes have been converted to JSON compliant data. Source code in tablite/base.py def as_json_serializable(\n self, row_count=\"row id\", start_on=1, columns=None, slice_=None\n):\n \"\"\"provides a JSON compatible format of the table.\n\n Args:\n row_count (str, optional): Label for row counts. Defaults to \"row id\".\n start_on (int, optional): row counts starts by default on 1.\n columns (list of str, optional): Column names.\n Defaults to None which returns all columns.\n slice_ (slice, optional): selector. Defaults to None which returns [:]\n\n Returns:\n JSON serializable dict: All python datatypes have been converted to JSON compliant data.\n \"\"\"\n if slice_ is None:\n slice_ = slice(0, len(self))\n\n assert isinstance(slice_, slice)\n new = {\"columns\": {}, \"total_rows\": len(self)}\n if row_count is not None:\n new[\"columns\"][row_count] = [\n i + start_on for i in range(*slice_.indices(len(self)))\n ]\n\n d = self.to_dict(columns, slice_=slice_)\n for k, data in d.items():\n new_k = unique_name(\n k, new[\"columns\"]\n ) # used to avoid overwriting the `row id` key.\n new[\"columns\"][new_k] = [\n DataTypes.to_json(v) for v in data\n ] # deal with non-json datatypes.\n return new\n "},{"location":"reference/base/#tablite.base.BaseTable.index","title":"tablite.base.BaseTable.index(*args) ","text":"param: *args: column names returns multikey index on the columns as d[(key tuple, )] = {index1, index2, ...} Examples: >>> table6 = Table()\n>>> table6['A'] = ['Alice', 'Bob', 'Bob', 'Ben', 'Charlie', 'Ben','Albert']\n>>> table6['B'] = ['Alison', 'Marley', 'Dylan', 'Affleck', 'Hepburn', 'Barnes', 'Einstein']\n >>> table6.index('A') # single key.\n{('Alice',): [0],\n ('Bob',): [1, 2],\n ('Ben',): [3, 5],\n ('Charlie',): [4],\n ('Albert',): [6]})\n >>> table6.index('A', 'B') # multiple keys.\n{('Alice', 'Alison'): [0],\n ('Bob', 'Marley'): [1],\n ('Bob', 'Dylan'): [2],\n ('Ben', 'Affleck'): [3],\n ('Charlie', 'Hepburn'): [4],\n ('Ben', 'Barnes'): [5],\n ('Albert', 'Einstein'): [6]})\n Source code in tablite/base.py def index(self, *args):\n \"\"\"\n param: *args: column names\n returns multikey index on the columns as d[(key tuple, )] = {index1, index2, ...}\n\n Examples:\n ```\n >>> table6 = Table()\n >>> table6['A'] = ['Alice', 'Bob', 'Bob', 
'Ben', 'Charlie', 'Ben','Albert']\n >>> table6['B'] = ['Alison', 'Marley', 'Dylan', 'Affleck', 'Hepburn', 'Barnes', 'Einstein']\n ```\n\n ```\n >>> table6.index('A') # single key.\n {('Alice',): [0],\n ('Bob',): [1, 2],\n ('Ben',): [3, 5],\n ('Charlie',): [4],\n ('Albert',): [6]})\n ```\n\n ```\n >>> table6.index('A', 'B') # multiple keys.\n {('Alice', 'Alison'): [0],\n ('Bob', 'Marley'): [1],\n ('Bob', 'Dylan'): [2],\n ('Ben', 'Affleck'): [3],\n ('Charlie', 'Hepburn'): [4],\n ('Ben', 'Barnes'): [5],\n ('Albert', 'Einstein'): [6]})\n ```\n\n \"\"\"\n idx = defaultdict(list)\n iterators = [iter(self.columns[c]) for c in args]\n for ix, key in enumerate(zip(*iterators)):\n key = tuple(numpy_to_python(k) for k in key)\n idx[key].append(ix)\n return idx\n "},{"location":"reference/base/#tablite.base.BaseTable.unique_index","title":"tablite.base.BaseTable.unique_index(*args, tqdm=_tqdm) ","text":"generates the index of unique rows given a list of column names PARAMETER DESCRIPTION *args columns names TYPE: any DEFAULT: () tqdm Defaults to _tqdm. TYPE: tqdm DEFAULT: tqdm RETURNS DESCRIPTION np.array(int64): indices of unique records. Source code in tablite/base.py def unique_index(self, *args, tqdm=_tqdm):\n \"\"\"generates the index of unique rows given a list of column names\n\n Args:\n *args (any): columns names\n tqdm (tqdm, optional): Defaults to _tqdm.\n\n Returns:\n np.array(int64): indices of unique records.\n \"\"\"\n if not args:\n raise ValueError(\"*args (column names) is required\")\n seen = set()\n unique = set()\n iterators = [iter(self.columns[c]) for c in args]\n for ix, key in tqdm(enumerate(zip(*iterators)), disable=Config.TQDM_DISABLE):\n key_hash = hash(tuple(numpy_to_python(k) for k in key))\n if key_hash in seen:\n continue\n else:\n seen.add(key_hash)\n unique.add(ix)\n return np.array(sorted(unique))\n "},{"location":"reference/base/#tablite.base-functions","title":"Functions","text":""},{"location":"reference/base/#tablite.base.register","title":"tablite.base.register(path) ","text":"registers path in file_registry The method is used by Table during init when the working directory path is set, so that python can clean all temporary files up at exit. PARAMETER DESCRIPTION path typically tmp/tablite-tmp/PID-{os.getpid()} TYPE: Path Source code in tablite/base.py def register(path):\n \"\"\"registers path in file_registry\n\n The method is used by Table during init when the working directory path\n is set, so that python can clean all temporary files up at exit.\n\n Args:\n path (Path): typically tmp/tablite-tmp/PID-{os.getpid()}\n \"\"\"\n global file_registry\n file_registry.add(path)\n "},{"location":"reference/base/#tablite.base.shutdown","title":"tablite.base.shutdown() ","text":"method to clean up temporary files triggered at shutdown. Source code in tablite/base.py def shutdown():\n \"\"\"method to clean up temporary files triggered at shutdown.\"\"\"\n for path in file_registry:\n if Config.pid in str(path): # safety feature to prevent rm -rf /\n log.debug(f\"shutdown: running rmtree({path})\")\n shutil.rmtree(path)\n "},{"location":"reference/config/","title":"Config","text":""},{"location":"reference/config/#tablite.config","title":"tablite.config ","text":""},{"location":"reference/config/#tablite.config-classes","title":"Classes","text":""},{"location":"reference/config/#tablite.config.Config","title":"tablite.config.Config ","text":" Bases: object Config class for Tablite Tables. 
The default location for the storage is loaded as Config.workdir = pathlib.Path(os.environ.get(\"TABLITE_TMPDIR\", f\"{tempfile.gettempdir()}/tablite-tmp\"))\n To overwrite, first import the config class, then set the new workdir. >>> from tablite import config\n>>> from pathlib import Path\n>>> config.workdir = Path(\"/this/new/location\")\n the new path will now be used for every new table. PAGE_SIZE = 1_000_000 sets the page size limit. Multiprocessing is enabled in one of three modes: AUTO = \"auto\" FALSE = \"sp\" FORCE = \"mp\" MULTIPROCESSING_MODE = AUTO is default. SINGLE_PROCESSING_LIMIT = 1_000_000 when the number of fields (rows x columns) exceeds this value, multiprocessing is used. "},{"location":"reference/config/#tablite.config.Config-attributes","title":"Attributes","text":""},{"location":"reference/config/#tablite.config.Config.USE_NIMPORTER","title":"tablite.config.Config.USE_NIMPORTER = os.environ.get('USE_NIMPORTER', 'true').lower() in ['1', 't', 'true', 'y', 'yes'] class-attribute instance-attribute ","text":""},{"location":"reference/config/#tablite.config.Config.ALLOW_CSV_READER_FALLTHROUGH","title":"tablite.config.Config.ALLOW_CSV_READER_FALLTHROUGH = os.environ.get('ALLOW_CSV_READER_FALLTHROUGH', 'true').lower() in ['1', 't', 'true', 'y', 'yes'] class-attribute instance-attribute ","text":""},{"location":"reference/config/#tablite.config.Config.NIM_SUPPORTED_CONV_TYPES","title":"tablite.config.Config.NIM_SUPPORTED_CONV_TYPES = ['Windows-1252', 'ISO-8859-1'] class-attribute instance-attribute ","text":""},{"location":"reference/config/#tablite.config.Config.workdir","title":"tablite.config.Config.workdir = pathlib.Path(os.environ.get('TABLITE_TMPDIR', f'{tempfile.gettempdir()}/tablite-tmp')) class-attribute instance-attribute ","text":""},{"location":"reference/config/#tablite.config.Config.pid","title":"tablite.config.Config.pid = f'pid-{os.getpid()}' class-attribute instance-attribute ","text":""},{"location":"reference/config/#tablite.config.Config.PAGE_SIZE","title":"tablite.config.Config.PAGE_SIZE = 1000000 class-attribute instance-attribute ","text":""},{"location":"reference/config/#tablite.config.Config.ENCODING","title":"tablite.config.Config.ENCODING = 'UTF-8' class-attribute instance-attribute ","text":""},{"location":"reference/config/#tablite.config.Config.DISK_LIMIT","title":"tablite.config.Config.DISK_LIMIT = int(10000000000.0) class-attribute instance-attribute ","text":"10e9 (10Gb) on 100 Gb disk means raise at 90 Gb disk usage. if DISK_LIMIT <= 0, the check is turned off. "},{"location":"reference/config/#tablite.config.Config.SINGLE_PROCESSING_LIMIT","title":"tablite.config.Config.SINGLE_PROCESSING_LIMIT = 1000000 class-attribute instance-attribute ","text":"when the number of fields (rows x columns) exceeds this value, multiprocessing is used. 
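For example, these settings can be tightened before a memory-intensive run and restored afterwards. A minimal sketch, using only the attributes documented above; the values shown are illustrative, not recommendations: ```python
from tablite.config import Config

Config.PAGE_SIZE = 500_000                   # smaller pages lower peak memory per task.
Config.MULTIPROCESSING_MODE = Config.FALSE   # "sp" forces single-process execution.
# ... run the memory-sensitive workload here ...
Config.reset()                               # restores the documented defaults.
``` 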
"},{"location":"reference/config/#tablite.config.Config.vpus","title":"tablite.config.Config.vpus = max(os.cpu_count() - 1, 1) class-attribute instance-attribute ","text":""},{"location":"reference/config/#tablite.config.Config.AUTO","title":"tablite.config.Config.AUTO = 'auto' class-attribute instance-attribute ","text":""},{"location":"reference/config/#tablite.config.Config.FALSE","title":"tablite.config.Config.FALSE = 'sp' class-attribute instance-attribute ","text":""},{"location":"reference/config/#tablite.config.Config.FORCE","title":"tablite.config.Config.FORCE = 'mp' class-attribute instance-attribute ","text":""},{"location":"reference/config/#tablite.config.Config.MULTIPROCESSING_MODE","title":"tablite.config.Config.MULTIPROCESSING_MODE = AUTO class-attribute instance-attribute ","text":""},{"location":"reference/config/#tablite.config.Config.TQDM_DISABLE","title":"tablite.config.Config.TQDM_DISABLE = False class-attribute instance-attribute ","text":""},{"location":"reference/config/#tablite.config.Config-functions","title":"Functions","text":""},{"location":"reference/config/#tablite.config.Config.reset","title":"tablite.config.Config.reset() classmethod ","text":"Resets the config class to original values. Source code in tablite/config.py @classmethod\ndef reset(cls):\n \"\"\"Resets the config class to original values.\"\"\"\n for k, v in _default_values.items():\n setattr(Config, k, v)\n "},{"location":"reference/config/#tablite.config.Config.page_steps","title":"tablite.config.Config.page_steps(length) classmethod ","text":"an iterator that yield start and end in page sizes YIELDS DESCRIPTION tuple start:int, end:int Source code in tablite/config.py @classmethod\ndef page_steps(cls, length):\n \"\"\"an iterator that yield start and end in page sizes\n\n Yields:\n tuple: start:int, end:int\n \"\"\"\n start, end = 0, 0\n for _ in range(0, length + 1, cls.PAGE_SIZE):\n start, end = end, min(end + cls.PAGE_SIZE, length)\n yield start, end\n if end == length:\n return\n "},{"location":"reference/core/","title":"Core","text":""},{"location":"reference/core/#tablite.core","title":"tablite.core ","text":""},{"location":"reference/core/#tablite.core-attributes","title":"Attributes","text":""},{"location":"reference/core/#tablite.core.log","title":"tablite.core.log = logging.getLogger(__name__) module-attribute ","text":""},{"location":"reference/core/#tablite.core-classes","title":"Classes","text":""},{"location":"reference/core/#tablite.core.Table","title":"tablite.core.Table(columns=None, headers=None, rows=None, _path=None) ","text":" Bases: BaseTable creates Table PARAMETER DESCRIPTION EITHER columns (dict, optional): dict with column names as keys, values as lists. 
Example: t = Table(columns={\"a\": [1, 2], \"b\": [3, 4]}) Source code in tablite/core.py def __init__(self, columns=None, headers=None, rows=None, _path=None) -> None:\n \"\"\"creates Table\n\n Args:\n EITHER:\n columns (dict, optional): dict with column names as keys, values as lists.\n Example: t = Table(columns={\"a\": [1, 2], \"b\": [3, 4]})\n OR\n headers (list of strings, optional): list of column names.\n rows (list of tuples or lists, optional): values for columns\n Example: t = Table(headers=[\"a\", \"b\"], rows=[[1,3], [2,4]])\n \"\"\"\n super().__init__(columns, headers, rows, _path)\n "},{"location":"reference/core/#tablite.core.Table-attributes","title":"Attributes","text":""},{"location":"reference/core/#tablite.core.Table.path","title":"tablite.core.Table.path = _path instance-attribute ","text":""},{"location":"reference/core/#tablite.core.Table.columns","title":"tablite.core.Table.columns = {} instance-attribute ","text":""},{"location":"reference/core/#tablite.core.Table.rows","title":"tablite.core.Table.rows property ","text":"enables row based iteration in python types. Example: for row in Table.rows:\n print(row)\n Yields: tuple: values is same order as columns. "},{"location":"reference/core/#tablite.core.Table-functions","title":"Functions","text":""},{"location":"reference/core/#tablite.core.Table.__str__","title":"tablite.core.Table.__str__() ","text":"Source code in tablite/base.py def __str__(self): # USER FUNCTION.\n return f\"{self.__class__.__name__}({len(self.columns):,} columns, {len(self):,} rows)\"\n "},{"location":"reference/core/#tablite.core.Table.__repr__","title":"tablite.core.Table.__repr__() ","text":"Source code in tablite/base.py def __repr__(self):\n return self.__str__()\n "},{"location":"reference/core/#tablite.core.Table.nbytes","title":"tablite.core.Table.nbytes() ","text":"finds the total bytes of the table on disk RETURNS DESCRIPTION tuple int: real bytes used on disk int: total bytes used if flattened Source code in tablite/base.py def nbytes(self): # USER FUNCTION.\n \"\"\"finds the total bytes of the table on disk\n\n Returns:\n tuple:\n int: real bytes used on disk\n int: total bytes used if flattened\n \"\"\"\n real = {}\n total = 0\n for column in self.columns.values():\n for page in set(column.pages):\n real[page] = page.path.stat().st_size\n for page in column.pages:\n total += real[page]\n return sum(real.values()), total\n "},{"location":"reference/core/#tablite.core.Table.items","title":"tablite.core.Table.items() ","text":"returns table as dict RETURNS DESCRIPTION dict Table as dict {column_name: [values], ...} Source code in tablite/base.py def items(self): # USER FUNCTION.\n \"\"\"returns table as dict\n\n Returns:\n dict: Table as dict `{column_name: [values], ...}`\n \"\"\"\n return {\n name: column[:].tolist() for name, column in self.columns.items()\n }.items()\n "},{"location":"reference/core/#tablite.core.Table.__delitem__","title":"tablite.core.Table.__delitem__(key) ","text":"Examples: >>> del table['a'] # removes column 'a'\n>>> del table[-3:] # removes last 3 rows from all columns.\n Source code in tablite/base.py def __delitem__(self, key): # USER FUNCTION.\n \"\"\"\n Examples:\n ```\n >>> del table['a'] # removes column 'a'\n >>> del table[-3:] # removes last 3 rows from all columns.\n ```\n \"\"\"\n if isinstance(key, (int, slice)):\n for column in self.columns.values():\n del column[key]\n elif key in self.columns:\n del self.columns[key]\n else:\n raise KeyError(f\"Key not found: {key}\")\n 
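Taken together, the constructor, rows and del behave like their dict/list counterparts. A minimal sketch using only the calls documented in this section: ```python
from tablite import Table

t = Table(columns={"a": [1, 2, 3], "b": [4, 5, 6]})
for row in t.rows:   # row-wise iteration in python types, e.g. (1, 4)
    print(row)
del t[-1:]           # removes the last row from all columns.
del t["b"]           # removes column 'b'.
assert len(t) == 2
``` 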
"},{"location":"reference/core/#tablite.core.Table.__setitem__","title":"tablite.core.Table.__setitem__(key, value) ","text":"table behaves like a dict. Args: key (str or hashable): column name value (iterable): list, tuple or nd.array with values. As Table now accepts the keyword columns as a dict: >>> t = Table(columns={'b':[4,5,6], 'c':[7,8,9]})\n and the header/data combinations: >>> t = Table(header=['b','c'], data=[[4,5,6],[7,8,9]])\n This has the side-benefit that tuples now can be used as headers. Source code in tablite/base.py def __setitem__(self, key, value): # USER FUNCTION\n \"\"\"table behaves like a dict.\n Args:\n key (str or hashable): column name\n value (iterable): list, tuple or nd.array with values.\n\n As Table now accepts the keyword `columns` as a dict:\n ```\n >>> t = Table(columns={'b':[4,5,6], 'c':[7,8,9]})\n ```\n and the header/data combinations:\n ```\n >>> t = Table(header=['b','c'], data=[[4,5,6],[7,8,9]])\n ```\n This has the side-benefit that tuples now can be used as headers.\n \"\"\"\n if value is None:\n self.columns[key] = Column(self.path, value=None)\n elif isinstance(value, (list, tuple)):\n value = list_to_np_array(value)\n self.columns[key] = Column(self.path, value)\n elif isinstance(value, (np.ndarray)):\n self.columns[key] = Column(self.path, value)\n elif isinstance(value, Column):\n self.columns[key] = value\n else:\n raise TypeError(f\"{type(value)} not supported.\")\n "},{"location":"reference/core/#tablite.core.Table.__getitem__","title":"tablite.core.Table.__getitem__(keys) ","text":"Enables selection of columns and rows PARAMETER DESCRIPTION keys TYPE: column name, integer or slice Examples >>> 10] selects first 10 rows from all columns TYPE: table[ >>> 20:3] selects column 'b' and 'c' and 'a' twice for a slice. TYPE: table['b', 'a', 'a', 'c', 2 Raises: KeyError: if key is not found. TypeError: if key is not a string, integer or slice. RETURNS DESCRIPTION Table returns columns in same order as selection. Source code in tablite/base.py def __getitem__(self, keys): # USER FUNCTION\n \"\"\"\n Enables selection of columns and rows\n\n Args:\n keys (column name, integer or slice):\n Examples:\n ```\n >>> table['a'] selects column 'a'\n >>> table[3] selects row 3 as a tuple.\n >>> table[:10] selects first 10 rows from all columns\n >>> table['a','b', slice(3,20,2)] selects a slice from columns 'a' and 'b'\n >>> table['b', 'a', 'a', 'c', 2:20:3] selects column 'b' and 'c' and 'a' twice for a slice.\n >>> table[('b', 'a', 'a', 'c')] selects columns 'b', 'a', 'a', and 'c' using a tuple.\n ```\n Raises:\n KeyError: if key is not found.\n TypeError: if key is not a string, integer or slice.\n\n Returns:\n Table: returns columns in same order as selection.\n \"\"\"\n\n if not isinstance(keys, tuple):\n if isinstance(keys, list):\n keys = tuple(keys)\n else:\n keys = (keys,)\n if isinstance(keys[0], tuple):\n keys = tuple(list(chain(*keys)))\n\n integers = [i for i in keys if isinstance(i, int)]\n if len(integers) == len(keys) == 1: # return a single tuple.\n keys = [slice(keys[0])]\n\n column_names = [i for i in keys if isinstance(i, str)]\n column_names = list(self.columns) if not column_names else column_names\n not_found = [name for name in column_names if name not in self.columns]\n if not_found:\n raise KeyError(f\"keys not found: {', '.join(not_found)}\")\n\n slices = [i for i in keys if isinstance(i, slice)]\n slc = slice(0, len(self)) if not slices else slices[0]\n\n if (\n len(slices) == 0 and len(column_names) == 1\n ): # e.g. 
tbl['a'] or tbl['a'][:10]\n col = self.columns[column_names[0]]\n if slices:\n return col[slc] # return slice from column as list of values\n else:\n return col # return whole column\n\n elif len(integers) == 1: # return a single tuple.\n row_no = integers[0]\n slc = slice(row_no, row_no + 1)\n return tuple(self.columns[name][slc].tolist()[0] for name in column_names)\n\n elif not slices: # e.g. new table with N whole columns.\n return self.__class__(\n columns={name: self.columns[name] for name in column_names}\n )\n\n else: # e.g. new table from selection of columns and slices.\n t = self.__class__()\n for name in column_names:\n column = self.columns[name]\n\n new_column = Column(t.path) # create new Column.\n for item in column.getpages(slc):\n if isinstance(item, np.ndarray):\n new_column.extend(item) # extend subslice (expensive)\n elif isinstance(item, SimplePage):\n new_column.pages.append(item) # extend page (cheap)\n else:\n raise TypeError(f\"Bad item: {item}\")\n\n # below:\n # set the new column directly on t.columns.\n # Do not use t[name] as that triggers __setitem__ again.\n t.columns[name] = new_column\n\n return t\n "},{"location":"reference/core/#tablite.core.Table.__len__","title":"tablite.core.Table.__len__() ","text":"Source code in tablite/base.py def __len__(self): # USER FUNCTION.\n if not self.columns:\n return 0\n return max(len(c) for c in self.columns.values())\n "},{"location":"reference/core/#tablite.core.Table.__eq__","title":"tablite.core.Table.__eq__(other) -> bool ","text":"Determines if two tables have identical content. PARAMETER DESCRIPTION other table for comparison TYPE: Table RETURNS DESCRIPTION bool True if tables are identical. TYPE: bool Source code in tablite/base.py def __eq__(self, other) -> bool: # USER FUNCTION.\n \"\"\"Determines if two tables have identical content.\n\n Args:\n other (Table): table for comparison\n\n Returns:\n bool: True if tables are identical.\n \"\"\"\n if isinstance(other, dict):\n return self.items() == other.items()\n if not isinstance(other, BaseTable):\n return False\n if id(self) == id(other):\n return True\n if len(self) != len(other):\n return False\n if len(self) == len(other) == 0:\n return True\n if self.columns.keys() != other.columns.keys():\n return False\n for name, col in self.columns.items():\n if not (col == other.columns[name]):\n return False\n return True\n "},{"location":"reference/core/#tablite.core.Table.clear","title":"tablite.core.Table.clear() ","text":"clears the table. Like dict().clear() Source code in tablite/base.py def clear(self): # USER FUNCTION.\n \"\"\"clears the table. Like dict().clear()\"\"\"\n self.columns.clear()\n "},{"location":"reference/core/#tablite.core.Table.save","title":"tablite.core.Table.save(path, compression_method=zipfile.ZIP_DEFLATED, compression_level=1) ","text":"saves table to compressed tpz file. PARAMETER DESCRIPTION path file destination. TYPE: Path compression_method See zipfile compression methods. Defaults to ZIP_DEFLATED. DEFAULT: ZIP_DEFLATED compression_level See zipfile compression levels. Defaults to 1. DEFAULT: 1 The file format is as follows: .tpz is a gzip archive with table metadata captured as table.yml and the necessary set of pages saved as .npy files. 
The zip contains table.yml which provides an overview of the data: --------------------------------------\n%YAML 1.2 yaml version\ncolumns: start of columns section.\n name: \u201c\u5217 1\u201d name of column 1.\n pages: [p1b1, p1b2] list of pages in column 1.\n name: \u201c\u5217 2\u201d name of column 2\n pages: [p2b1, p2b2] list of pages in column 2.\n----------------------------------------\n Source code in tablite/base.py def save(\n self, path, compression_method=zipfile.ZIP_DEFLATED, compression_level=1\n): # USER FUNCTION.\n \"\"\"saves table to compressed tpz file.\n\n Args:\n path (Path): file destination.\n compression_method: See zipfile compression methods. Defaults to ZIP_DEFLATED.\n compression_level: See zipfile compression levels. Defaults to 1.\n The default settings produce 80% compression at 10% slowdown.\n\n The file format is as follows:\n .tpz is a gzip archive with table metadata captured as table.yml\n and the necessary set of pages saved as .npy files.\n\n The zip contains table.yml which provides an overview of the data:\n ```\n --------------------------------------\n %YAML 1.2 yaml version\n columns: start of columns section.\n name: \u201c\u5217 1\u201d name of column 1.\n pages: [p1b1, p1b2] list of pages in column 1.\n name: \u201c\u5217 2\u201d name of column 2\n pages: [p2b1, p2b2] list of pages in column 2.\n ----------------------------------------\n ```\n \"\"\"\n if isinstance(path, str):\n path = Path(path)\n type_check(path, Path)\n if path.is_dir():\n raise TypeError(f\"filename needed: {path}\")\n if path.suffix != \".tpz\":\n path = path.parent / (path.parts[-1] + \".tpz\")\n\n # create yaml document\n _page_counter = 0\n d = {}\n cols = {}\n for name, col in self.columns.items():\n type_check(col, Column)\n cols[name] = {\"pages\": [p.path.name for p in col.pages]}\n _page_counter += len(col.pages)\n d[\"columns\"] = cols\n yml = yaml.safe_dump(\n d, sort_keys=False, allow_unicode=True, default_flow_style=None\n )\n\n _file_counter = 0\n with zipfile.ZipFile(\n path, \"w\", compression=compression_method, compresslevel=compression_level\n ) as f:\n log.debug(f\"writing .tpz to {path} with\\n{yml}\")\n f.writestr(\"table.yml\", yml)\n for name, col in self.columns.items():\n for page in set(\n col.pages\n ): # set of pages! remember t *= 1000 repeats t 1000x\n with open(page.path, \"rb\", buffering=0) as raw_io:\n f.writestr(page.path.name, raw_io.read())\n _file_counter += 1\n log.debug(f\"adding Page {page.path}\")\n\n _fields = len(self) * len(self.columns)\n _avg = _fields // _page_counter\n log.debug(\n f\"Wrote {_fields:,} on {_page_counter:,} pages in {_file_counter} files: {_avg} fields/page\"\n )\n "},{"location":"reference/core/#tablite.core.Table.load","title":"tablite.core.Table.load(path, tqdm=_tqdm) classmethod ","text":"loads a table from .tpz file. See also Table.save for details on the file format. PARAMETER DESCRIPTION path source file TYPE: Path RETURNS DESCRIPTION Table table in read-only mode. 
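A save/load round trip might look like this (a minimal sketch; the file name is illustrative): ```python
from tablite import Table

t = Table(columns={"a": [1, 2], "b": [3, 4]})
t.save("example.tpz")           # the .tpz suffix is appended if missing, see save() above.
t2 = Table.load("example.tpz")  # returns the table in read-only mode.
assert t == t2                  # __eq__ compares content, not identity.
``` 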
Source code in tablite/base.py @classmethod\ndef load(cls, path, tqdm=_tqdm): # USER FUNCTION.\n \"\"\"loads a table from .tpz file.\n See also Table.save for details on the file format.\n\n Args:\n path (Path): source file\n\n Returns:\n Table: table in read-only mode.\n \"\"\"\n path = Path(path)\n log.debug(f\"loading {path}\")\n with zipfile.ZipFile(path, \"r\") as f:\n yml = f.read(\"table.yml\")\n metadata = yaml.safe_load(yml)\n t = cls()\n\n page_count = sum([len(c[\"pages\"]) for c in metadata[\"columns\"].values()])\n\n with tqdm(\n total=page_count,\n desc=f\"loading '{path.name}' file\",\n disable=Config.TQDM_DISABLE,\n ) as pbar:\n for name, d in metadata[\"columns\"].items():\n column = Column(t.path)\n for page in d[\"pages\"]:\n bytestream = io.BytesIO(f.read(page))\n data = np.load(bytestream, allow_pickle=True, fix_imports=False)\n column.extend(data)\n pbar.update(1)\n t.columns[name] = column\n update_access_time(path)\n return t\n "},{"location":"reference/core/#tablite.core.Table.copy","title":"tablite.core.Table.copy() ","text":"Source code in tablite/base.py def copy(self):\n cls = type(self)\n t = cls()\n for name, column in self.columns.items():\n new = Column(t.path)\n new.pages = column.pages[:]\n t.columns[name] = new\n return t\n "},{"location":"reference/core/#tablite.core.Table.__imul__","title":"tablite.core.Table.__imul__(other) ","text":"Repeats instance of table N times. Like list: t = t * N PARAMETER DESCRIPTION other multiplier TYPE: int Source code in tablite/base.py def __imul__(self, other):\n \"\"\"Repeats instance of table N times.\n\n Like list: `t = t * N`\n\n Args:\n other (int): multiplier\n \"\"\"\n if not (isinstance(other, int) and other > 0):\n raise TypeError(\n f\"a table can be repeated an integer number of times, not {type(other)} number of times\"\n )\n for col in self.columns.values():\n col *= other\n return self\n "},{"location":"reference/core/#tablite.core.Table.__mul__","title":"tablite.core.Table.__mul__(other) ","text":"Repeat table N times. Like list: new = old * N PARAMETER DESCRIPTION other multiplier TYPE: int RETURNS DESCRIPTION Table Source code in tablite/base.py def __mul__(self, other):\n \"\"\"Repeat table N times.\n Like list: `new = old * N`\n\n Args:\n other (int): multiplier\n\n Returns:\n Table\n \"\"\"\n new = self.copy()\n return new.__imul__(other)\n "},{"location":"reference/core/#tablite.core.Table.__iadd__","title":"tablite.core.Table.__iadd__(other) ","text":"Concatenates tables with same column names. Like list: table_1 += table_2 RAISES DESCRIPTION ValueError If column names don't match. RETURNS DESCRIPTION None self is updated. Source code in tablite/base.py def __iadd__(self, other):\n \"\"\"Concatenates tables with same column names.\n\n Like list: `table_1 += table_2`\n\n Args:\n other (Table)\n\n Raises:\n ValueError: If column names don't match.\n\n Returns:\n None: self is updated.\n \"\"\"\n type_check(other, BaseTable)\n for name in self.columns.keys():\n if name not in other.columns:\n raise ValueError(f\"{name} not in other\")\n for name in other.columns.keys():\n if name not in self.columns:\n raise ValueError(f\"{name} missing from self\")\n\n for name, column in self.columns.items():\n other_col = other.columns.get(name, None)\n column.pages.extend(other_col.pages[:])\n return self\n "},{"location":"reference/core/#tablite.core.Table.__add__","title":"tablite.core.Table.__add__(other) ","text":"Concatenates tables with same column names. 
Like list: table_3 = table_1 + table_2 RAISES DESCRIPTION ValueError If column names don't match. RETURNS DESCRIPTION Table Source code in tablite/base.py def __add__(self, other):\n \"\"\"Concatenates tables with same column names.\n\n Like list: `table_3 = table_1 + table_2`\n\n Args:\n other (Table)\n\n Raises:\n ValueError: If column names don't match.\n\n Returns:\n Table\n \"\"\"\n type_check(other, BaseTable)\n cp = self.copy()\n cp += other\n return cp\n "},{"location":"reference/core/#tablite.core.Table.add_rows","title":"tablite.core.Table.add_rows(*args, **kwargs) ","text":"it's more efficient to add many rows at once. if both args and kwargs, then args are added first, followed by kwargs. supported cases: >>> t = Table()\n>>> t.add_columns('row','A','B','C')\n>>> t.add_rows(1, 1, 2, 3) # (1) individual values as args\n>>> t.add_rows([2, 1, 2, 3]) # (2) list of values as args\n>>> t.add_rows((3, 1, 2, 3)) # (3) tuple of values as args\n>>> t.add_rows(*(4, 1, 2, 3)) # (4) unpacked tuple becomes arg like (1)\n>>> t.add_rows(row=5, A=1, B=2, C=3) # (5) kwargs\n>>> t.add_rows(**{'row': 6, 'A': 1, 'B': 2, 'C': 3}) # (6) dict / json interpreted as kwargs\n>>> t.add_rows((7, 1, 2, 3), (8, 4, 5, 6)) # (7) two (or more) tuples as args\n>>> t.add_rows([9, 1, 2, 3], [10, 4, 5, 6]) # (8) two (or more) lists as args\n>>> t.add_rows(\n {'row': 11, 'A': 1, 'B': 2, 'C': 3},\n {'row': 12, 'A': 4, 'B': 5, 'C': 6}\n ) # (9) two (or more) dicts as args - roughly comma sep'd json.\n>>> t.add_rows( *[\n {'row': 13, 'A': 1, 'B': 2, 'C': 3},\n {'row': 14, 'A': 1, 'B': 2, 'C': 3}\n ]) # (10) list of dicts as args\n>>> t.add_rows(row=[15,16], A=[1,1], B=[2,2], C=[3,3]) # (11) kwargs with lists as values\n Source code in tablite/base.py def add_rows(self, *args, **kwargs):\n \"\"\"it's more efficient to add many rows at once.\n\n if both args and kwargs, then args are added first, followed by kwargs.\n\n supported cases:\n ```\n >>> t = Table()\n >>> t.add_columns('row','A','B','C')\n >>> t.add_rows(1, 1, 2, 3) # (1) individual values as args\n >>> t.add_rows([2, 1, 2, 3]) # (2) list of values as args\n >>> t.add_rows((3, 1, 2, 3)) # (3) tuple of values as args\n >>> t.add_rows(*(4, 1, 2, 3)) # (4) unpacked tuple becomes arg like (1)\n >>> t.add_rows(row=5, A=1, B=2, C=3) # (5) kwargs\n >>> t.add_rows(**{'row': 6, 'A': 1, 'B': 2, 'C': 3}) # (6) dict / json interpreted as kwargs\n >>> t.add_rows((7, 1, 2, 3), (8, 4, 5, 6)) # (7) two (or more) tuples as args\n >>> t.add_rows([9, 1, 2, 3], [10, 4, 5, 6]) # (8) two (or more) lists as args\n >>> t.add_rows(\n {'row': 11, 'A': 1, 'B': 2, 'C': 3},\n {'row': 12, 'A': 4, 'B': 5, 'C': 6}\n ) # (9) two (or more) dicts as args - roughly comma sep'd json.\n >>> t.add_rows( *[\n {'row': 13, 'A': 1, 'B': 2, 'C': 3},\n {'row': 14, 'A': 1, 'B': 2, 'C': 3}\n ]) # (10) list of dicts as args\n >>> t.add_rows(row=[15,16], A=[1,1], B=[2,2], C=[3,3]) # (11) kwargs with lists as values\n ```\n\n \"\"\"\n if not BaseTable._add_row_slow_warning:\n warnings.warn(\n \"add_rows is slow. Consider using add_columns and then assigning values to the columns directly.\"\n )\n BaseTable._add_row_slow_warning = True\n\n if args:\n if not all(isinstance(i, (list, tuple, dict)) for i in args): # 1,4\n args = [args]\n\n if all(isinstance(i, (list, tuple, dict)) for i in args): # 2,3,7,8\n # 1. 
turn the data into columns:\n\n d = {n: [] for n in self.columns}\n for arg in args:\n if len(arg) != len(self.columns):\n raise ValueError(\n f\"len({arg})== {len(arg)}, but there are {len(self.columns)} columns\"\n )\n\n if isinstance(arg, dict):\n for k, v in arg.items(): # 7,8\n d[k].append(v)\n\n elif isinstance(arg, (list, tuple)): # 2,3\n for n, v in zip(self.columns, arg):\n d[n].append(v)\n\n else:\n raise TypeError(f\"{arg}?\")\n # 2. extend the columns\n for n, values in d.items():\n col = self.columns[n]\n col.extend(list_to_np_array(values))\n\n if kwargs:\n if isinstance(kwargs, dict):\n if all(isinstance(v, (list, tuple)) for v in kwargs.values()):\n for k, v in kwargs.items():\n col = self.columns[k]\n col.extend(list_to_np_array(v))\n else:\n for k, v in kwargs.items():\n col = self.columns[k]\n col.extend(np.array([v]))\n else:\n raise ValueError(f\"format not recognised: {kwargs}\")\n\n return\n "},{"location":"reference/core/#tablite.core.Table.add_columns","title":"tablite.core.Table.add_columns(*names) ","text":"Adds column names to table. Source code in tablite/base.py def add_columns(self, *names):\n \"\"\"Adds column names to table.\"\"\"\n for name in names:\n self.columns[name] = Column(self.path)\n "},{"location":"reference/core/#tablite.core.Table.add_column","title":"tablite.core.Table.add_column(name, data=None) ","text":"verbose alias for table[name] = data, that checks if name already exists PARAMETER DESCRIPTION name column name TYPE: str data values. Defaults to None. TYPE: list,tuple) DEFAULT: None RAISES DESCRIPTION TypeError name isn't string ValueError name already exists Source code in tablite/base.py def add_column(self, name, data=None):\n \"\"\"verbose alias for table[name] = data, that checks if name already exists\n\n Args:\n name (str): column name\n data ((list,tuple), optional): values. Defaults to None.\n\n Raises:\n TypeError: name isn't string\n ValueError: name already exists\n \"\"\"\n if not isinstance(name, str):\n raise TypeError(\"expected name as string\")\n if name in self.columns:\n raise ValueError(f\"{name} already in {self.columns}\")\n self.__setitem__(name, data)\n "},{"location":"reference/core/#tablite.core.Table.stack","title":"tablite.core.Table.stack(other) ","text":"returns the joint stack of tables with overlapping column names. Example: | Table A| + | Table B| = | Table AB |\n| A| B| C| | A| B| D| | A| B| C| -|\n | A| B| -| D|\n Source code in tablite/base.py def stack(self, other):\n \"\"\"\n returns the joint stack of tables with overlapping column names.\n Example:\n ```\n | Table A| + | Table B| = | Table AB |\n | A| B| C| | A| B| D| | A| B| C| -|\n | A| B| -| D|\n ```\n \"\"\"\n if not isinstance(other, BaseTable):\n raise TypeError(f\"stack only works for Table, not {type(other)}\")\n\n cp = self.copy()\n for name, col2 in other.columns.items():\n if name not in cp.columns:\n cp[name] = [None] * len(self)\n cp[name].pages.extend(col2.pages[:])\n\n for name in self.columns:\n if name not in other.columns:\n if len(cp) > 0:\n cp[name].extend(np.array([None] * len(other)))\n return cp\n "},{"location":"reference/core/#tablite.core.Table.types","title":"tablite.core.Table.types() ","text":"returns nested dict of data types in the form: {column name: {python type class: number of instances }, ... 
} example: >>> t.types()\n{\n 'A': {<class 'str'>: 7},\n 'B': {<class 'int'>: 7}\n}\n Source code in tablite/base.py def types(self):\n \"\"\"\n returns nested dict of data types in the form:\n `{column name: {python type class: number of instances }, ... }`\n\n example:\n ```\n >>> t.types()\n {\n 'A': {<class 'str'>: 7},\n 'B': {<class 'int'>: 7}\n }\n ```\n \"\"\"\n d = {}\n for name, col in self.columns.items():\n assert isinstance(col, Column)\n d[name] = col.types()\n return d\n "},{"location":"reference/core/#tablite.core.Table.display_dict","title":"tablite.core.Table.display_dict(slice_=None, blanks=None, dtype=False) ","text":"helper for creating dict for display. PARAMETER DESCRIPTION slice_ python slice. Defaults to None. TYPE: slice DEFAULT: None blanks fill value for None . Defaults to None. TYPE: optional DEFAULT: None dtype Adds datatype to each column. Defaults to False. TYPE: bool DEFAULT: False RAISES DESCRIPTION TypeError slice_ must be None or slice. RETURNS DESCRIPTION dict from Table. Source code in tablite/base.py def display_dict(self, slice_=None, blanks=None, dtype=False):\n \"\"\"helper for creating dict for display.\n\n Args:\n slice_ (slice, optional): python slice. Defaults to None.\n blanks (optional): fill value for `None`. Defaults to None.\n dtype (bool, optional): Adds datatype to each column. Defaults to False.\n\n Raises:\n TypeError: slice_ must be None or slice.\n\n Returns:\n dict: from Table.\n \"\"\"\n if not self.columns:\n print(\"Empty Table\")\n return\n\n def datatype(col): # PRIVATE\n \"\"\"creates label for column datatype.\"\"\"\n types = col.types()\n if len(types) == 0:\n typ = \"empty\"\n elif len(types) == 1:\n dt, _ = types.popitem()\n typ = dt.__name__\n else:\n typ = \"mixed\"\n return typ\n\n row_count_tags = [\"#\", \"~\", \"*\"]\n cols = set(self.columns)\n for n, tag in product(range(1, 6), row_count_tags):\n if n * tag not in cols:\n tag = n * tag\n break\n\n if not isinstance(slice_, (slice, type(None))):\n raise TypeError(f\"slice_ must be None or slice, not {type(slice_)}\")\n if isinstance(slice_, slice):\n slc = slice_\n if slice_ is None:\n if len(self) <= 20:\n slc = slice(0, 20, 1)\n else:\n slc = None\n\n n = len(self)\n if slc: # either we want slc or we want everything.\n row_no = list(range(*slc.indices(len(self))))\n data = {tag: [f\"{i:,}\".rjust(2) for i in row_no]}\n for name, col in self.columns.items():\n data[name] = list(chain(iter(col), repeat(blanks, times=n - len(col))))[\n slc\n ]\n else:\n data = {}\n j = int(math.ceil(math.log10(n)) / 3) + len(str(n))\n row_no = (\n [f\"{i:,}\".rjust(j) for i in range(7)]\n + [\"...\"]\n + [f\"{i:,}\".rjust(j) for i in range(n - 7, n)]\n )\n data = {tag: row_no}\n\n for name, col in self.columns.items():\n if len(col) == n:\n row = col[:7].tolist() + [\"...\"] + col[-7:].tolist()\n else:\n empty = [blanks] * 7\n head = (col[:7].tolist() + empty)[:7]\n tail = (col[n - 7 :].tolist() + empty)[-7:]\n row = head + [\"...\"] + tail\n data[name] = row\n\n if dtype:\n for name, values in data.items():\n if name in self.columns:\n col = self.columns[name]\n values.insert(0, datatype(col))\n else:\n values.insert(0, \"row\")\n\n return data\n "},{"location":"reference/core/#tablite.core.Table.to_ascii","title":"tablite.core.Table.to_ascii(slice_=None, blanks=None, dtype=False) ","text":"returns ascii view of table as string. PARAMETER DESCRIPTION slice_ slice to determine table snippet. TYPE: slice DEFAULT: None blanks value for whitespace. Defaults to None. 
TYPE: str DEFAULT: None dtype adds subheader with datatype for column. Defaults to False. TYPE: bool DEFAULT: False Source code in tablite/base.py def to_ascii(self, slice_=None, blanks=None, dtype=False):\n \"\"\"returns ascii view of table as string.\n\n Args:\n slice_ (slice, optional): slice to determine table snippet.\n blanks (str, optional): value for whitespace. Defaults to None.\n dtype (bool, optional): adds subheader with datatype for column. Defaults to False.\n \"\"\"\n\n def adjust(v, length): # PRIVATE FUNCTION\n \"\"\"whitespace justifies field values based on datatype\"\"\"\n if v is None:\n return str(blanks).ljust(length)\n elif isinstance(v, str):\n return v.ljust(length)\n else:\n return str(v).rjust(length)\n\n if not self.columns:\n return str(self)\n\n d = {}\n for name, values in self.display_dict(\n slice_=slice_, blanks=blanks, dtype=dtype\n ).items():\n as_text = [str(v) for v in values] + [str(name)]\n width = max(len(i) for i in as_text)\n new_name = name.center(width, \" \")\n if dtype:\n values[0] = values[0].center(width, \" \")\n d[new_name] = [adjust(v, width) for v in values]\n\n rows = dict_to_rows(d)\n s = []\n s.append(\"+\" + \"+\".join([\"=\" * len(n) for n in rows[0]]) + \"+\")\n s.append(\"|\" + \"|\".join(rows[0]) + \"|\") # column names\n start = 1\n if dtype:\n s.append(\"|\" + \"|\".join(rows[1]) + \"|\") # datatypes\n start = 2\n\n s.append(\"+\" + \"+\".join([\"-\" * len(n) for n in rows[0]]) + \"+\")\n for row in rows[start:]:\n s.append(\"|\" + \"|\".join(row) + \"|\")\n s.append(\"+\" + \"+\".join([\"=\" * len(n) for n in rows[0]]) + \"+\")\n\n if len(set(len(c) for c in self.columns.values())) != 1:\n warning = f\"Warning: Columns have different lengths. {blanks} is used as fill value.\"\n s.append(warning)\n\n return \"\\n\".join(s)\n "},{"location":"reference/core/#tablite.core.Table.show","title":"tablite.core.Table.show(slice_=None, blanks=None, dtype=False) ","text":"prints ascii view of table. PARAMETER DESCRIPTION slice_ slice to determine table snippet. TYPE: slice DEFAULT: None blanks value for whitespace. Defaults to None. TYPE: str DEFAULT: None dtype adds subheader with datatype for column. Defaults to False. TYPE: bool DEFAULT: False Source code in tablite/base.py def show(self, slice_=None, blanks=None, dtype=False):\n \"\"\"prints ascii view of table.\n\n Args:\n slice_ (slice, optional): slice to determine table snippet.\n blanks (str, optional): value for whitespace. Defaults to None.\n dtype (bool, optional): adds subheader with datatype for column. Defaults to False.\n \"\"\"\n print(self.to_ascii(slice_=slice_, blanks=blanks, dtype=dtype))\n "},{"location":"reference/core/#tablite.core.Table.to_dict","title":"tablite.core.Table.to_dict(columns=None, slice_=None) ","text":"columns: list of column names. Default is None == all columns. slice_: slice. Default is None == all rows. returns: dict with columns as keys and lists of values. Example: >>> t.show()\n+===+===+===+\n| # | a | b |\n|row|int|int|\n+---+---+---+\n| 0 | 1| 3|\n| 1 | 2| 4|\n+===+===+===+\n>>> t.to_dict()\n{'a':[1,2], 'b':[3,4]}\n Source code in tablite/base.py def to_dict(self, columns=None, slice_=None):\n \"\"\"\n columns: list of column names. Default is None == all columns.\n slice_: slice. 
Default is None == all rows.\n\n returns: dict with columns as keys and lists of values.\n\n Example:\n ```\n >>> t.show()\n +===+===+===+\n | # | a | b |\n |row|int|int|\n +---+---+---+\n | 0 | 1| 3|\n | 1 | 2| 4|\n +===+===+===+\n >>> t.to_dict()\n {'a':[1,2], 'b':[3,4]}\n ```\n\n \"\"\"\n if slice_ is None:\n slice_ = slice(0, len(self))\n assert isinstance(slice_, slice)\n\n if columns is None:\n columns = list(self.columns.keys())\n if not isinstance(columns, list):\n raise TypeError(\"expected columns as list of strings\")\n\n return {name: list(self.columns[name][slice_]) for name in columns}\n "},{"location":"reference/core/#tablite.core.Table.as_json_serializable","title":"tablite.core.Table.as_json_serializable(row_count='row id', start_on=1, columns=None, slice_=None) ","text":"provides a JSON compatible format of the table. PARAMETER DESCRIPTION row_count Label for row counts. Defaults to \"row id\". TYPE: str DEFAULT: 'row id' start_on row counts starts by default on 1. TYPE: int DEFAULT: 1 columns Column names. Defaults to None which returns all columns. TYPE: list of str DEFAULT: None slice_ selector. Defaults to None which returns [:] TYPE: slice DEFAULT: None RETURNS DESCRIPTION JSON serializable dict: All python datatypes have been converted to JSON compliant data. Source code in tablite/base.py def as_json_serializable(\n self, row_count=\"row id\", start_on=1, columns=None, slice_=None\n):\n \"\"\"provides a JSON compatible format of the table.\n\n Args:\n row_count (str, optional): Label for row counts. Defaults to \"row id\".\n start_on (int, optional): row counts starts by default on 1.\n columns (list of str, optional): Column names.\n Defaults to None which returns all columns.\n slice_ (slice, optional): selector. Defaults to None which returns [:]\n\n Returns:\n JSON serializable dict: All python datatypes have been converted to JSON compliant data.\n \"\"\"\n if slice_ is None:\n slice_ = slice(0, len(self))\n\n assert isinstance(slice_, slice)\n new = {\"columns\": {}, \"total_rows\": len(self)}\n if row_count is not None:\n new[\"columns\"][row_count] = [\n i + start_on for i in range(*slice_.indices(len(self)))\n ]\n\n d = self.to_dict(columns, slice_=slice_)\n for k, data in d.items():\n new_k = unique_name(\n k, new[\"columns\"]\n ) # used to avoid overwriting the `row id` key.\n new[\"columns\"][new_k] = [\n DataTypes.to_json(v) for v in data\n ] # deal with non-json datatypes.\n return new\n "},{"location":"reference/core/#tablite.core.Table.index","title":"tablite.core.Table.index(*args) ","text":"param: *args: column names returns multikey index on the columns as d[(key tuple, )] = {index1, index2, ...} Examples: >>> table6 = Table()\n>>> table6['A'] = ['Alice', 'Bob', 'Bob', 'Ben', 'Charlie', 'Ben','Albert']\n>>> table6['B'] = ['Alison', 'Marley', 'Dylan', 'Affleck', 'Hepburn', 'Barnes', 'Einstein']\n >>> table6.index('A') # single key.\n{('Alice',): [0],\n ('Bob',): [1, 2],\n ('Ben',): [3, 5],\n ('Charlie',): [4],\n ('Albert',): [6]})\n >>> table6.index('A', 'B') # multiple keys.\n{('Alice', 'Alison'): [0],\n ('Bob', 'Marley'): [1],\n ('Bob', 'Dylan'): [2],\n ('Ben', 'Affleck'): [3],\n ('Charlie', 'Hepburn'): [4],\n ('Ben', 'Barnes'): [5],\n ('Albert', 'Einstein'): [6]})\n Source code in tablite/base.py def index(self, *args):\n \"\"\"\n param: *args: column names\n returns multikey index on the columns as d[(key tuple, )] = {index1, index2, ...}\n\n Examples:\n ```\n >>> table6 = Table()\n >>> table6['A'] = ['Alice', 'Bob', 'Bob', 'Ben', 'Charlie', 
'Ben','Albert']\n >>> table6['B'] = ['Alison', 'Marley', 'Dylan', 'Affleck', 'Hepburn', 'Barnes', 'Einstein']\n ```\n\n ```\n >>> table6.index('A') # single key.\n {('Alice',): [0],\n ('Bob',): [1, 2],\n ('Ben',): [3, 5],\n ('Charlie',): [4],\n ('Albert',): [6]})\n ```\n\n ```\n >>> table6.index('A', 'B') # multiple keys.\n {('Alice', 'Alison'): [0],\n ('Bob', 'Marley'): [1],\n ('Bob', 'Dylan'): [2],\n ('Ben', 'Affleck'): [3],\n ('Charlie', 'Hepburn'): [4],\n ('Ben', 'Barnes'): [5],\n ('Albert', 'Einstein'): [6]})\n ```\n\n \"\"\"\n idx = defaultdict(list)\n iterators = [iter(self.columns[c]) for c in args]\n for ix, key in enumerate(zip(*iterators)):\n key = tuple(numpy_to_python(k) for k in key)\n idx[key].append(ix)\n return idx\n "},{"location":"reference/core/#tablite.core.Table.unique_index","title":"tablite.core.Table.unique_index(*args, tqdm=_tqdm) ","text":"generates the index of unique rows given a list of column names PARAMETER DESCRIPTION *args columns names TYPE: any DEFAULT: () tqdm Defaults to _tqdm. TYPE: tqdm DEFAULT: tqdm RETURNS DESCRIPTION np.array(int64): indices of unique records. Source code in tablite/base.py def unique_index(self, *args, tqdm=_tqdm):\n \"\"\"generates the index of unique rows given a list of column names\n\n Args:\n *args (any): columns names\n tqdm (tqdm, optional): Defaults to _tqdm.\n\n Returns:\n np.array(int64): indices of unique records.\n \"\"\"\n if not args:\n raise ValueError(\"*args (column names) is required\")\n seen = set()\n unique = set()\n iterators = [iter(self.columns[c]) for c in args]\n for ix, key in tqdm(enumerate(zip(*iterators)), disable=Config.TQDM_DISABLE):\n key_hash = hash(tuple(numpy_to_python(k) for k in key))\n if key_hash in seen:\n continue\n else:\n seen.add(key_hash)\n unique.add(ix)\n return np.array(sorted(unique))\n "},{"location":"reference/core/#tablite.core.Table.from_file","title":"tablite.core.Table.from_file(path, columns=None, first_row_has_headers=True, header_row_index=0, encoding=None, start=0, limit=sys.maxsize, sheet=None, guess_datatypes=True, newline='\\n', text_qualifier=None, delimiter=None, strip_leading_and_tailing_whitespace=True, text_escape_openings='', text_escape_closures='', skip_empty: ValidSkipEmpty = 'NONE', tqdm=_tqdm) -> Table classmethod ","text":" reads path and imports 1 or more tables\n\n REQUIRED\n --------\n path: pathlib.Path or str\n selection of filereader uses path.suffix.\n See `filereaders`.\n\n OPTIONAL\n --------\n columns:\n None: (default) All columns will be imported.\n List: only column names from list will be imported (if present in file)\n e.g. ['A', 'B', 'C', 'D']\n\n datatype is detected using Datatypes.guess(...)\n You can try it out with:\n >> from tablite.datatypes import DataTypes\n >> DataTypes.guess(['001','100'])\n [1,100]\n\n if the format cannot be achieved the read type is kept.\n Excess column names are ignored.\n\n HINT: To get the head of file use:\n >>> from tablite.tools import head\n >>> head = head(path)\n\n first_row_has_headers: boolean\n True: (default) first row is used as column names.\n False: integers are used as column names.\n\n encoding: str. Defaults to None (autodetect using n bytes).\n n is declared in filereader_utils as ENCODING_GUESS_BYTES\n\n start: the first line to be read (default: 0)\n\n limit: the number of lines to be read from start (default sys.maxint ~ 2**63)\n\n OPTIONAL FOR EXCEL AND ODS READERS\n ----------------------------------\n\n sheet: sheet name to import (applicable to excel- and ods-reader only)\n e.g. 
'sheet_1'\n sheet names that are not found are ignored.\n\n OPTIONAL FOR TEXT READERS\n -------------------------\n guess_datatypes: bool\n True: (default) datatypes are guessed using DataTypes.guess(...)\n False: all data is imported as strings.\n\n newline: newline character (applicable to text_reader only)\n str: '\\n' (default) or '\\r\\n'\n\n text_qualifier: character (applicable to text_reader only)\n None: No text qualifier is used.\n str: \" or '\n\n delimiter: character (applicable to text_reader only)\n None: file suffix is used to determine field delimiter:\n .txt: \"|\"\n .csv: \",\",\n .ssv: \";\"\n .tsv: \"\\t\" (tab)\n\n strip_leading_and_tailing_whitespace: bool:\n True: default\n\n text_escape_openings: (applicable to text_reader only)\n None: default\n str: list of characters such as ([{\n\n text_escape_closures: (applicable to text_reader only)\n None: default\n str: list of characters such as }])\n Source code in tablite/core.py @classmethod\ndef from_file(\n cls,\n path,\n columns=None,\n first_row_has_headers=True,\n header_row_index=0,\n encoding=None,\n start=0,\n limit=sys.maxsize,\n sheet=None,\n guess_datatypes=True,\n newline=\"\\n\",\n text_qualifier=None,\n delimiter=None,\n strip_leading_and_tailing_whitespace=True,\n text_escape_openings=\"\",\n text_escape_closures=\"\",\n skip_empty: ValidSkipEmpty=\"NONE\",\n tqdm=_tqdm,\n) -> \"Table\":\n \"\"\"\n reads path and imports 1 or more tables\n\n REQUIRED\n --------\n path: pathlib.Path or str\n selection of filereader uses path.suffix.\n See `filereaders`.\n\n OPTIONAL\n --------\n columns:\n None: (default) All columns will be imported.\n List: only column names from list will be imported (if present in file)\n e.g. ['A', 'B', 'C', 'D']\n\n datatype is detected using Datatypes.guess(...)\n You can try it out with:\n >> from tablite.datatypes import DataTypes\n >> DataTypes.guess(['001','100'])\n [1,100]\n\n if the format cannot be achieved the read type is kept.\n Excess column names are ignored.\n\n HINT: To get the head of file use:\n >>> from tablite.tools import head\n >>> head = head(path)\n\n first_row_has_headers: boolean\n True: (default) first row is used as column names.\n False: integers are used as column names.\n\n encoding: str. Defaults to None (autodetect using n bytes).\n n is declared in filereader_utils as ENCODING_GUESS_BYTES\n\n start: the first line to be read (default: 0)\n\n limit: the number of lines to be read from start (default sys.maxint ~ 2**63)\n\n OPTIONAL FOR EXCEL AND ODS READERS\n ----------------------------------\n\n sheet: sheet name to import (applicable to excel- and ods-reader only)\n e.g. 
'sheet_1'\n sheet names that are not found are ignored.\n\n OPTIONAL FOR TEXT READERS\n -------------------------\n guess_datatypes: bool\n True: (default) datatypes are guessed using DataTypes.guess(...)\n False: all data is imported as strings.\n\n newline: newline character (applicable to text_reader only)\n str: '\\n' (default) or '\\r\\n'\n\n text_qualifier: character (applicable to text_reader only)\n None: No text qualifier is used.\n str: \" or '\n\n delimiter: character (applicable to text_reader only)\n None: file suffix is used to determine field delimiter:\n .txt: \"|\"\n .csv: \",\",\n .ssv: \";\"\n .tsv: \"\\t\" (tab)\n\n strip_leading_and_tailing_whitespace: bool:\n True: default\n\n text_escape_openings: (applicable to text_reader only)\n None: default\n str: list of characters such as ([{\n\n text_escape_closures: (applicable to text_reader only)\n None: default\n str: list of characters such as }])\n\n \"\"\"\n if isinstance(path, str):\n path = Path(path)\n type_check(path, Path)\n\n if not path.exists():\n raise FileNotFoundError(f\"file not found: {path}\")\n\n if not isinstance(start, int) or not 0 <= start <= sys.maxsize:\n raise ValueError(f\"start {start} not in range(0,{sys.maxsize})\")\n\n if not isinstance(limit, int) or not 0 < limit <= sys.maxsize:\n raise ValueError(f\"limit {limit} not in range(0,{sys.maxsize})\")\n\n if not isinstance(first_row_has_headers, bool):\n raise TypeError(\"first_row_has_headers is not bool\")\n\n import_as = path.suffix\n if import_as.startswith(\".\"):\n import_as = import_as[1:]\n\n reader = import_utils.file_readers.get(import_as, None)\n if reader is None:\n raise ValueError(f\"{import_as} is not in supported format: {import_utils.valid_readers}\")\n\n additional_configs = {\"tqdm\": tqdm}\n if reader == import_utils.text_reader:\n # here we inject tqdm, if tqdm is not provided, use generic iterator\n # fmt:off\n config = (path, columns, first_row_has_headers, header_row_index, encoding, start, limit, newline,\n guess_datatypes, text_qualifier, strip_leading_and_tailing_whitespace, skip_empty,\n delimiter, text_escape_openings, text_escape_closures)\n # fmt:on\n\n elif reader == import_utils.from_html:\n config = (path,)\n elif reader == import_utils.from_hdf5:\n config = (path,)\n\n elif reader == import_utils.excel_reader:\n # config = path, first_row_has_headers, sheet, columns, start, limit\n config = (\n path,\n first_row_has_headers,\n header_row_index,\n sheet,\n columns,\n skip_empty,\n start,\n limit,\n ) # if file length changes - re-import.\n\n if reader == import_utils.ods_reader:\n # path, first_row_has_headers=True, sheet=None, columns=None, start=0, limit=sys.maxsize,\n config = (\n str(path),\n first_row_has_headers,\n header_row_index,\n sheet,\n columns,\n skip_empty,\n start,\n limit,\n ) # if file length changes - re-import.\n\n # At this point the import config seems valid.\n # Now we check if the file already has been imported.\n\n # publish the settings\n return reader(cls, *config, **additional_configs)\n "},{"location":"reference/core/#tablite.core.Table.from_pandas","title":"tablite.core.Table.from_pandas(df) classmethod ","text":"Creates Table using pd.to_dict('list') similar to: >>> import pandas as pd\n>>> df = pd.DataFrame({'a':[1,2,3], 'b':[4,5,6]})\n>>> df\n a b\n 0 1 4\n 1 2 5\n 2 3 6\n>>> df.to_dict('list')\n{'a': [1, 2, 3], 'b': [4, 5, 6]}\n>>> t = Table(columns=df.to_dict('list'))\n>>> t.show()\n +===+===+===+\n | # | a | b |\n |row|int|int|\n +---+---+---+\n | 0 | 1| 4|\n | 1 | 2| 5|\n | 2 
| 3| 6|\n +===+===+===+\n Source code in tablite/core.py @classmethod\ndef from_pandas(cls, df):\n \"\"\"\n Creates Table using pd.to_dict('list')\n\n similar to:\n ```\n >>> import pandas as pd\n >>> df = pd.DataFrame({'a':[1,2,3], 'b':[4,5,6]})\n >>> df\n a b\n 0 1 4\n 1 2 5\n 2 3 6\n >>> df.to_dict('list')\n {'a': [1, 2, 3], 'b': [4, 5, 6]}\n >>> t = Table.from_dict(df.to_dict('list'))\n >>> t.show()\n +===+===+===+\n | # | a | b |\n |row|int|int|\n +---+---+---+\n | 0 | 1| 4|\n | 1 | 2| 5|\n | 2 | 3| 6|\n +===+===+===+\n ```\n \"\"\"\n return import_utils.from_pandas(cls, df)\n "},{"location":"reference/core/#tablite.core.Table.from_hdf5","title":"tablite.core.Table.from_hdf5(path) classmethod ","text":"imports an exported hdf5 table. Source code in tablite/core.py @classmethod\ndef from_hdf5(cls, path):\n \"\"\"\n imports an exported hdf5 table.\n \"\"\"\n return import_utils.from_hdf5(cls, path)\n "},{"location":"reference/core/#tablite.core.Table.from_json","title":"tablite.core.Table.from_json(jsn) classmethod ","text":"Imports table exported using .to_json Source code in tablite/core.py @classmethod\ndef from_json(cls, jsn):\n \"\"\"\n Imports table exported using .to_json\n \"\"\"\n return import_utils.from_json(cls, jsn)\n "},{"location":"reference/core/#tablite.core.Table.to_hdf5","title":"tablite.core.Table.to_hdf5(path) ","text":"creates a copy of the table as hdf5 Source code in tablite/core.py def to_hdf5(self, path):\n \"\"\"\n creates a copy of the table as hdf5\n \"\"\"\n export_utils.to_hdf5(self, path)\n "},{"location":"reference/core/#tablite.core.Table.to_pandas","title":"tablite.core.Table.to_pandas() ","text":"returns pandas.DataFrame Source code in tablite/core.py def to_pandas(self):\n \"\"\"\n returns pandas.DataFrame\n \"\"\"\n return export_utils.to_pandas(self)\n "},{"location":"reference/core/#tablite.core.Table.to_sql","title":"tablite.core.Table.to_sql(name) ","text":"generates ANSI-92 compliant SQL. 
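Before the source listing below, a minimal sketch of `to_sql` (hedged: the exact statement layout produced by `export_utils.to_sql` is not pinned down by this reference; the table data is illustrative):

```python
from tablite import Table

t = Table()
t["a"] = [1, 2, 3]
t["b"] = [4, 5, 6]

sql = t.to_sql(name="my_table")  # returns ANSI-92 compliant SQL as a str
# expect roughly a CREATE TABLE statement plus INSERTs; inspect before use:
print(sql)
```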
Source code in tablite/core.py def to_sql(self, name):\n \"\"\"\n generates ANSI-92 compliant SQL.\n \"\"\"\n return export_utils.to_sql(self, name) # remove after update to test suite.\n "},{"location":"reference/core/#tablite.core.Table.to_json","title":"tablite.core.Table.to_json() ","text":"returns JSON Source code in tablite/core.py def to_json(self):\n \"\"\"\n returns JSON\n \"\"\"\n return export_utils.to_json(self)\n "},{"location":"reference/core/#tablite.core.Table.to_xlsx","title":"tablite.core.Table.to_xlsx(path) ","text":"exports table to path Source code in tablite/core.py def to_xlsx(self, path):\n \"\"\"\n exports table to path\n \"\"\"\n export_utils.path_suffix_check(path, \".xlsx\")\n export_utils.excel_writer(self, path)\n "},{"location":"reference/core/#tablite.core.Table.to_ods","title":"tablite.core.Table.to_ods(path) ","text":"exports table to path Source code in tablite/core.py def to_ods(self, path):\n \"\"\"\n exports table to path\n \"\"\"\n export_utils.path_suffix_check(path, \".ods\")\n export_utils.excel_writer(self, path)\n "},{"location":"reference/core/#tablite.core.Table.to_csv","title":"tablite.core.Table.to_csv(path) ","text":"exports table to path Source code in tablite/core.py def to_csv(self, path):\n \"\"\"\n exports table to path\n \"\"\"\n export_utils.path_suffix_check(path, \".csv\")\n export_utils.text_writer(self, path)\n "},{"location":"reference/core/#tablite.core.Table.to_tsv","title":"tablite.core.Table.to_tsv(path) ","text":"exports table to path Source code in tablite/core.py def to_tsv(self, path):\n \"\"\"\n exports table to path\n \"\"\"\n export_utils.path_suffix_check(path, \".tsv\")\n export_utils.text_writer(self, path)\n "},{"location":"reference/core/#tablite.core.Table.to_text","title":"tablite.core.Table.to_text(path) ","text":"exports table to path Source code in tablite/core.py def to_text(self, path):\n \"\"\"\n exports table to path\n \"\"\"\n export_utils.path_suffix_check(path, \".txt\")\n export_utils.text_writer(self, path)\n "},{"location":"reference/core/#tablite.core.Table.to_html","title":"tablite.core.Table.to_html(path) ","text":"exports table to path Source code in tablite/core.py def to_html(self, path):\n \"\"\"\n exports table to path\n \"\"\"\n export_utils.path_suffix_check(path, \".html\")\n export_utils.to_html(self, path)\n "},{"location":"reference/core/#tablite.core.Table.expression","title":"tablite.core.Table.expression(expression) ","text":"filters based on an expression, such as: \"all((A==B, C!=4, 200<D))\"\n which is interpreted using python's compiler to: def _f(A,B,C,D):\n return all((A==B, C!=4, 200<D))\n Source code in tablite/core.py def expression(self, expression):\n \"\"\"\n filters based on an expression, such as:\n\n \"all((A==B, C!=4, 200<D))\"\n\n which is interpreted using python's compiler to:\n\n def _f(A,B,C,D):\n return all((A==B, C!=4, 200<D))\n \"\"\"\n return redux._filter_using_expression(self, expression)\n "},{"location":"reference/core/#tablite.core.Table.filter","title":"tablite.core.Table.filter(expressions, filter_type='all', tqdm=_tqdm) ","text":"enables filtering across columns for multiple criteria. 
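A hedged sketch of the list-of-dicts form; the accepted dictionary keys are specified in the parameter notes that follow, and the return shape is an assumption (recent tablite versions return a pair of tables, rows that passed and rows that failed):

```python
from tablite import Table

t = Table()
t["A"] = [1, 2, 3, 4]
t["B"] = [1, 0, 3, 0]

# keep rows where A == B; the dict keys are explained below.
res = t.filter(
    [{"column1": "A", "criteria": "==", "column2": "B"}],
    filter_type="all",
)
```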
expressions: str: Expression that can be compiled and executed row by row.\n example: \"all((A==B and C!=4 and 200<D))\"\n\nlist of dicts: (example):\n\n L = [\n {'column1':'A', 'criteria': \"==\", 'column2': 'B'},\n {'column1':'C', 'criteria': \"!=\", \"value2\": '4'},\n {'value1': 200, 'criteria': \"<\", 'column2': 'D' }\n ]\n\naccepted dictionary keys: 'column1', 'column2', 'criteria', 'value1', 'value2'\n filter_type: 'all' or 'any' Source code in tablite/core.py def filter(self, expressions, filter_type=\"all\", tqdm=_tqdm):\n \"\"\"\n enables filtering across columns for multiple criteria.\n\n expressions:\n\n str: Expression that can be compiled and executed row by row.\n example: \"all((A==B and C!=4 and 200<D))\"\n\n list of dicts: (example):\n\n L = [\n {'column1':'A', 'criteria': \"==\", 'column2': 'B'},\n {'column1':'C', 'criteria': \"!=\", \"value2\": '4'},\n {'value1': 200, 'criteria': \"<\", 'column2': 'D' }\n ]\n\n accepted dictionary keys: 'column1', 'column2', 'criteria', 'value1', 'value2'\n\n filter_type: 'all' or 'any'\n \"\"\"\n return redux.filter(self, expressions, filter_type, tqdm)\n "},{"location":"reference/core/#tablite.core.Table.sort_index","title":"tablite.core.Table.sort_index(sort_mode='excel', tqdm=_tqdm, pbar=None, **kwargs) ","text":"helper for methods sort and is_sorted param: sort_mode: str: \"alphanumeric\", \"unix\", or, \"excel\" (default) param: **kwargs: sort criteria. See Table.sort() Source code in tablite/core.py def sort_index(self, sort_mode=\"excel\", tqdm=_tqdm, pbar=None, **kwargs):\n \"\"\"\n helper for methods `sort` and `is_sorted`\n\n param: sort_mode: str: \"alphanumeric\", \"unix\", or, \"excel\" (default)\n param: **kwargs: sort criteria. See Table.sort()\n \"\"\"\n return sortation.sort_index(self, sort_mode, tqdm=tqdm, pbar=pbar, **kwargs)\n "},{"location":"reference/core/#tablite.core.Table.reindex","title":"tablite.core.Table.reindex(index) ","text":"index: list of integers that declare sort order. Examples: Table: ['a','b','c','d','e','f','g','h']\nindex: [0,2,4,6]\nresult: ['a','c','e','g']\n\nTable: ['a','b','c','d','e','f','g','h']\nindex: [0,2,4,6,1,3,5,7]\nresult: ['a','c','e','g','b','d','f','h']\n Source code in tablite/core.py def reindex(self, index):\n \"\"\"\n index: list of integers that declare sort order.\n\n Examples:\n\n Table: ['a','b','c','d','e','f','g','h']\n index: [0,2,4,6]\n result: ['a','c','e','g']\n\n Table: ['a','b','c','d','e','f','g','h']\n index: [0,2,4,6,1,3,5,7]\n result: ['a','c','e','g','b','d','f','h']\n\n \"\"\"\n if isinstance(index, list):\n index = np.array(index)\n return _reindex.reindex(self, index)\n "},{"location":"reference/core/#tablite.core.Table.drop_duplicates","title":"tablite.core.Table.drop_duplicates(*args) ","text":"removes duplicate rows based on column names args: (optional) column_names if no args, all columns are used. Source code in tablite/core.py def drop_duplicates(self, *args):\n \"\"\"\n removes duplicate rows based on column names\n\n args: (optional) column_names\n if no args, all columns are used.\n \"\"\"\n if not args:\n args = self.columns\n index = self.unique_index(*args)\n return self.reindex(index)\n "},{"location":"reference/core/#tablite.core.Table.sort","title":"tablite.core.Table.sort(mapping, sort_mode='excel', tqdm=_tqdm, pbar: _tqdm = None) ","text":"Perform multi-pass sorting with precedence given by the order of column names. 
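A minimal sketch before the parameter table below (data illustrative; `sort` and `sorted` semantics as documented in the entries that follow):

```python
from tablite import Table

t = Table()
t["A"] = [2, 1, 2, 1]
t["B"] = [10, 20, 30, 40]

t.sort(mapping={"A": False})       # ascending by 'A', sorts t in place
s = t.sorted(mapping={"A": True})  # descending copy; t is left unchanged
```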
PARAMETER DESCRIPTION mapping keys as columns, values as boolean for 'reverse' TYPE: dict sort_mode str: \"alphanumeric\", \"unix\", or, \"excel\" DEFAULT: 'excel' RETURNS DESCRIPTION None Table.sort is sorted inplace Examples: Table.sort(mapping={'A':False}) means sort by 'A' in ascending order. Table.sort(mapping={'A':True, 'B':False}) means sort 'A' in descending order, then (2nd priority) sort B in ascending order. Source code in tablite/core.py def sort(self, mapping, sort_mode=\"excel\", tqdm=_tqdm, pbar: _tqdm = None):\n \"\"\"Perform multi-pass sorting with precedence given by the order of column names.\n\n Args:\n mapping (dict): keys as columns,\n values as boolean for 'reverse'\n sort_mode: str: \"alphanumeric\", \"unix\", or, \"excel\"\n\n Returns:\n None: Table.sort is sorted inplace\n\n Examples:\n Table.sort(mapping={'A':False}) means sort by 'A' in ascending order.\n Table.sort(mapping={'A':True, 'B':False}) means sort 'A' in descending order, then (2nd priority)\n sort B in ascending order.\n \"\"\"\n new = sortation.sort(self, mapping, sort_mode, tqdm=tqdm, pbar=pbar)\n self.columns = new.columns\n "},{"location":"reference/core/#tablite.core.Table.sorted","title":"tablite.core.Table.sorted(mapping, sort_mode='excel', tqdm=_tqdm, pbar: _tqdm = None) ","text":"See sort. Sorted returns a new table in contrast to \"sort\", which is in-place. RETURNS DESCRIPTION Table. Source code in tablite/core.py def sorted(self, mapping, sort_mode=\"excel\", tqdm=_tqdm, pbar: _tqdm = None):\n \"\"\"See sort.\n Sorted returns a new table in contrast to \"sort\", which is in-place.\n\n Returns:\n Table.\n \"\"\"\n return sortation.sort(self, mapping, sort_mode, tqdm=tqdm, pbar=pbar)\n "},{"location":"reference/core/#tablite.core.Table.is_sorted","title":"tablite.core.Table.is_sorted(mapping, sort_mode='excel') ","text":"Performs multi-pass sorting check with precedence given by the order of column names. **kwargs: optional: sort criteria. Source code in tablite/core.py def is_sorted(self, mapping, sort_mode=\"excel\"):\n \"\"\"Performs multi-pass sorting check with precedence given by the order of column names.\n **kwargs: optional: sort criteria. 
See Table.sort()\n :return bool\n \"\"\"\n return sortation.is_sorted(self, mapping, sort_mode)\n "},{"location":"reference/core/#tablite.core.Table.any","title":"tablite.core.Table.any(**kwargs) ","text":"returns Table for rows where ANY kwargs match :param kwargs: dictionary with headers and values / boolean callable Source code in tablite/core.py def any(self, **kwargs):\n \"\"\"\n returns Table for rows where ANY kwargs match\n :param kwargs: dictionary with headers and values / boolean callable\n \"\"\"\n return redux.filter_any(self, **kwargs)\n "},{"location":"reference/core/#tablite.core.Table.all","title":"tablite.core.Table.all(**kwargs) ","text":"returns Table for rows where ALL kwargs match :param kwargs: dictionary with headers and values / boolean callable Examples: t = Table()\nt['a'] = [1,2,3,4]\nt['b'] = [10,20,30,40]\n\ndef f(x):\n return x == 4\ndef g(x):\n return x < 20\n\nt2 = t.any( **{\"a\":f, \"b\":g})\nassert [r for r in t2.rows] == [[1, 10], [4, 40]]\n\nt2 = t.any(a=f,b=g)\nassert [r for r in t2.rows] == [[1, 10], [4, 40]]\n\ndef h(x):\n return x>=2\n\ndef i(x):\n return x<=30\n\nt2 = t.all(a=h,b=i)\nassert [r for r in t2.rows] == [[2,20], [3, 30]]\n Source code in tablite/core.py def all(self, **kwargs):\n \"\"\"\n returns Table for rows where ALL kwargs match\n :param kwargs: dictionary with headers and values / boolean callable\n\n Examples:\n\n t = Table()\n t['a'] = [1,2,3,4]\n t['b'] = [10,20,30,40]\n\n def f(x):\n return x == 4\n def g(x):\n return x < 20\n\n t2 = t.any( **{\"a\":f, \"b\":g})\n assert [r for r in t2.rows] == [[1, 10], [4, 40]]\n\n t2 = t.any(a=f,b=g)\n assert [r for r in t2.rows] == [[1, 10], [4, 40]]\n\n def h(x):\n return x>=2\n\n def i(x):\n return x<=30\n\n t2 = t.all(a=h,b=i)\n assert [r for r in t2.rows] == [[2,20], [3, 30]]\n\n\n \"\"\"\n return redux.filter_all(self, **kwargs)\n "},{"location":"reference/core/#tablite.core.Table.drop","title":"tablite.core.Table.drop(*args) ","text":"removes all rows where args are present. Example: t = Table() t['A'] = [1,2,3,None] t['B'] = [None,2,3,4] t2 = t.drop(None) t2['A'][:], t2['B'][:] ([2,3], [2,3]) Source code in tablite/core.py def drop(self, *args):\n \"\"\"\n removes all rows where args are present.\n\n Example:\n >>> t = Table()\n >>> t['A'] = [1,2,3,None]\n >>> t['B'] = [None,2,3,4]\n >>> t2 = t.drop(None)\n >>> t2['A'][:], t2['B'][:]\n ([2,3], [2,3])\n\n \"\"\"\n if not args:\n raise ValueError(\"What to drop? None? np.nan? \")\n return redux.drop(self, *args)\n "},{"location":"reference/core/#tablite.core.Table.replace","title":"tablite.core.Table.replace(mapping, columns=None, tqdm=_tqdm, pbar=None) ","text":"replaces all mapped keys with values from named columns PARAMETER DESCRIPTION mapping keys are targets for replacement, values are replacements. TYPE: dict columns target columns. 
Defaults to None (all columns) TYPE: list or str DEFAULT: None RAISES DESCRIPTION ValueError description Source code in tablite/core.py def replace(self, mapping, columns=None, tqdm=_tqdm, pbar=None):\n \"\"\"replaces all mapped keys with values from named columns\n\n Args:\n mapping (dict): keys are targets for replacement,\n values are replacements.\n columns (list or str, optional): target columns.\n Defaults to None (all columns)\n\n Raises:\n ValueError: _description_\n \"\"\"\n if columns is None:\n columns = list(self.columns)\n if not isinstance(columns, list) and columns in self.columns:\n columns = [columns]\n type_check(columns, list)\n for n in columns:\n if n not in self.columns:\n raise ValueError(f\"column not found: {n}\")\n\n if pbar is None:\n total = len(columns)\n pbar = tqdm(total=total, desc=\"replace\", disable=Config.TQDM_DISABLE)\n\n for name in columns:\n col = self.columns[name]\n col.replace(mapping)\n pbar.update(1)\n "},{"location":"reference/core/#tablite.core.Table.groupby","title":"tablite.core.Table.groupby(keys, functions, tqdm=_tqdm, pbar=None) ","text":"keys: column names for grouping. functions: [optional] list of column names and group functions (See GroupBy class) returns: table Example: t = Table()\nt.add_column('A', data=[1, 1, 2, 2, 3, 3] * 2)\nt.add_column('B', data=[1, 2, 3, 4, 5, 6] * 2)\nt.add_column('C', data=[6, 5, 4, 3, 2, 1] * 2)\n\nt.show()\n+=====+=====+=====+\n| A | B | C |\n| int | int | int |\n+-----+-----+-----+\n| 1| 1| 6|\n| 1| 2| 5|\n| 2| 3| 4|\n| 2| 4| 3|\n| 3| 5| 2|\n| 3| 6| 1|\n| 1| 1| 6|\n| 1| 2| 5|\n| 2| 3| 4|\n| 2| 4| 3|\n| 3| 5| 2|\n| 3| 6| 1|\n+=====+=====+=====+\n\ng = t.groupby(keys=['A', 'C'], functions=[('B', gb.sum)])\ng.show()\n+===+===+===+======+\n| # | A | C |Sum(B)|\n|row|int|int| int |\n+---+---+---+------+\n|0 | 1| 6| 2|\n|1 | 1| 5| 4|\n|2 | 2| 4| 6|\n|3 | 2| 3| 8|\n|4 | 3| 2| 10|\n|5 | 3| 1| 12|\n+===+===+===+======+\n Cheat sheet: list of unique values >>> g1 = t.groupby(keys=['A'], functions=[])\n>>> g1['A'][:]\n[1,2,3]\n alternatively: t['A'].unique() [1,2,3] list of unique values, grouped by longest combination. >>> g2 = t.groupby(keys=['A', 'B'], functions=[])\n>>> g2['A'][:], g2['B'][:]\n([1,1,2,2,3,3], [1,2,3,4,5,6])\n alternatively: >>> list(zip(*t.index('A', 'B').keys()))\n[(1,1,2,2,3,3) (1,2,3,4,5,6)]\n A key (unique values) and count hereof. 
>>> g3 = t.groupby(keys=['A'], functions=[('A', gb.count)])\n>>> g3['A'][:], g3['Count(A)'][:]\n([1,2,3], [4,4,4])\n alternatively: >>> t['A'].histogram()\n([1,2,3], [4,4,4])\n for more examples see: https://github.com/root-11/tablite/blob/master/tests/test_groupby.py Source code in tablite/core.py def groupby(self, keys, functions, tqdm=_tqdm, pbar=None):\n \"\"\"\n keys: column names for grouping.\n functions: [optional] list of column names and group functions (See GroupBy class)\n returns: table\n\n Example:\n ```\n t = Table()\n t.add_column('A', data=[1, 1, 2, 2, 3, 3] * 2)\n t.add_column('B', data=[1, 2, 3, 4, 5, 6] * 2)\n t.add_column('C', data=[6, 5, 4, 3, 2, 1] * 2)\n\n t.show()\n +=====+=====+=====+\n | A | B | C |\n | int | int | int |\n +-----+-----+-----+\n | 1| 1| 6|\n | 1| 2| 5|\n | 2| 3| 4|\n | 2| 4| 3|\n | 3| 5| 2|\n | 3| 6| 1|\n | 1| 1| 6|\n | 1| 2| 5|\n | 2| 3| 4|\n | 2| 4| 3|\n | 3| 5| 2|\n | 3| 6| 1|\n +=====+=====+=====+\n\n g = t.groupby(keys=['A', 'C'], functions=[('B', gb.sum)])\n g.show()\n +===+===+===+======+\n | # | A | C |Sum(B)|\n |row|int|int| int |\n +---+---+---+------+\n |0 | 1| 6| 2|\n |1 | 1| 5| 4|\n |2 | 2| 4| 6|\n |3 | 2| 3| 8|\n |4 | 3| 2| 10|\n |5 | 3| 1| 12|\n +===+===+===+======+\n ```\n Cheat sheet:\n\n list of unique values\n ```\n >>> g1 = t.groupby(keys=['A'], functions=[])\n >>> g1['A'][:]\n [1,2,3]\n ```\n alternatively:\n >>> t['A'].unique()\n [1,2,3]\n\n list of unique values, grouped by longest combination.\n ```\n >>> g2 = t.groupby(keys=['A', 'B'], functions=[])\n >>> g2['A'][:], g2['B'][:]\n ([1,1,2,2,3,3], [1,2,3,4,5,6])\n ```\n alternatively:\n ```\n >>> list(zip(*t.index('A', 'B').keys()))\n [(1,1,2,2,3,3) (1,2,3,4,5,6)]\n ```\n A key (unique values) and count hereof.\n ```\n >>> g3 = t.groupby(keys=['A'], functions=[('A', gb.count)])\n >>> g3['A'][:], g3['Count(A)'][:]\n ([1,2,3], [4,4,4])\n ```\n alternatively:\n ```\n >>> t['A'].histogram()\n ([1,2,3], [4,4,4])\n ```\n for more examples see:\n https://github.com/root-11/tablite/blob/master/tests/test_groupby.py\n\n \"\"\"\n return _groupby(self, keys, functions, tqdm)\n "},{"location":"reference/core/#tablite.core.Table.pivot","title":"tablite.core.Table.pivot(rows, columns, functions, values_as_rows=True, tqdm=_tqdm, pbar=None) ","text":"param: rows: column names to keep as rows param: columns: column names to keep as columns param: functions: aggregation functions from the Groupby class as example: t.show()\n+=====+=====+=====+\n| A | B | C |\n| int | int | int |\n+-----+-----+-----+\n| 1| 1| 6|\n| 1| 2| 5|\n| 2| 3| 4|\n| 2| 4| 3|\n| 3| 5| 2|\n| 3| 6| 1|\n| 1| 1| 6|\n| 1| 2| 5|\n| 2| 3| 4|\n| 2| 4| 3|\n| 3| 5| 2|\n| 3| 6| 1|\n+=====+=====+=====+\n\nt2 = t.pivot(rows=['C'], columns=['A'], functions=[('B', gb.sum)])\nt2.show()\n+===+===+========+=====+=====+=====+\n| # | C |function|(A=1)|(A=2)|(A=3)|\n|row|int| str |mixed|mixed|mixed|\n+---+---+--------+-----+-----+-----+\n|0 | 6|Sum(B) | 2|None |None |\n|1 | 5|Sum(B) | 4|None |None |\n|2 | 4|Sum(B) |None | 6|None |\n|3 | 3|Sum(B) |None | 8|None |\n|4 | 2|Sum(B) |None |None | 10|\n|5 | 1|Sum(B) |None |None | 12|\n+===+===+========+=====+=====+=====+\n Source code in tablite/core.py def pivot(self, rows, columns, functions, values_as_rows=True, tqdm=_tqdm, pbar=None):\n \"\"\"\n param: rows: column names to keep as rows\n param: columns: column names to keep as columns\n param: functions: aggregation functions from the Groupby class as\n\n example:\n ```\n t.show()\n +=====+=====+=====+\n | A | B | C |\n | int | int | int |\n 
+-----+-----+-----+\n | 1| 1| 6|\n | 1| 2| 5|\n | 2| 3| 4|\n | 2| 4| 3|\n | 3| 5| 2|\n | 3| 6| 1|\n | 1| 1| 6|\n | 1| 2| 5|\n | 2| 3| 4|\n | 2| 4| 3|\n | 3| 5| 2|\n | 3| 6| 1|\n +=====+=====+=====+\n\n t2 = t.pivot(rows=['C'], columns=['A'], functions=[('B', gb.sum)])\n t2.show()\n +===+===+========+=====+=====+=====+\n | # | C |function|(A=1)|(A=2)|(A=3)|\n |row|int| str |mixed|mixed|mixed|\n +---+---+--------+-----+-----+-----+\n |0 | 6|Sum(B) | 2|None |None |\n |1 | 5|Sum(B) | 4|None |None |\n |2 | 4|Sum(B) |None | 6|None |\n |3 | 3|Sum(B) |None | 8|None |\n |4 | 2|Sum(B) |None |None | 10|\n |5 | 1|Sum(B) |None |None | 12|\n +===+===+========+=====+=====+=====+\n ```\n \"\"\"\n return pivots.pivot(self, rows, columns, functions, values_as_rows, tqdm=tqdm, pbar=pbar)\n "},{"location":"reference/core/#tablite.core.Table.merge","title":"tablite.core.Table.merge(left, right, new, criteria) ","text":"takes from LEFT where criteria is True else RIGHT. :param: T: Table :param: criteria: np.array(bool): if True take left column else take right column :param left: (str) column name :param right: (str) column name :param new: (str) new name :returns: T Example: >>> c.show()\n+==+====+====+====+====+\n| #| A | B | C | D |\n+--+----+----+----+----+\n| 0| 1| 10| 1| 11|\n| 1| 2| 20| 2| 12|\n| 2| 3|None| 3| 13|\n| 3|None| 40|None|None|\n| 4| 5| 50|None|None|\n| 5|None|None| 6| 16|\n| 6|None|None| 7| 17|\n+==+====+====+====+====+\n\n>>> c.merge(\"A\", \"C\", new=\"E\", criteria=[v != None for v in c['A']])\n>>> c.show()\n+==+====+====+====+\n| #| B | D | E |\n+--+----+----+----+\n| 0| 10| 11| 1|\n| 1| 20| 12| 2|\n| 2|None| 13| 3|\n| 3| 40|None|None|\n| 4| 50|None| 5|\n| 5|None| 16| 6|\n| 6|None| 17| 7|\n+==+====+====+====+\n Source code in tablite/core.py def merge(self, left, right, new, criteria):\n \"\"\" takes from LEFT where criteria is True else RIGHT.\n :param: T: Table\n :param: criteria: np.array(bool): \n if True take left column\n else take right column\n :param left: (str) column name\n :param right: (str) column name\n :param new: (str) new name\n\n :returns: T\n\n Example:\n ```\n >>> c.show()\n +==+====+====+====+====+\n | #| A | B | C | D |\n +--+----+----+----+----+\n | 0| 1| 10| 1| 11|\n | 1| 2| 20| 2| 12|\n | 2| 3|None| 3| 13|\n | 3|None| 40|None|None|\n | 4| 5| 50|None|None|\n | 5|None|None| 6| 16|\n | 6|None|None| 7| 17|\n +==+====+====+====+====+\n\n >>> c.merge(\"A\", \"C\", new=\"E\", criteria=[v != None for v in c['A']])\n >>> c.show()\n +==+====+====+====+\n | #| B | D | E |\n +--+----+----+----+\n | 0| 10| 11| 1|\n | 1| 20| 12| 2|\n | 2|None| 13| 3|\n | 3| 40|None|None|\n | 4| 50|None| 5|\n | 5|None| 16| 6|\n | 6|None| 17| 7|\n +==+====+====+====+\n ```\n \"\"\"\n return merge.where(self, criteria,left,right,new)\n "},{"location":"reference/core/#tablite.core.Table.column_select","title":"tablite.core.Table.column_select(cols: list[ColumnSelectorDict], tqdm=_tqdm, TaskManager=_TaskManager) ","text":"type-casts columns from a given table to specified type(s) cols list of dicts: (example): cols = [\n {'column':'A', 'type': 'bool'},\n {'column':'B', 'type': 'int', 'allow_empty': True},\n {'column':'B', 'type': 'float', 'allow_empty': False, 'rename': 'C'},\n]\n 'column' : column name of the input table that we want to type-cast 'type' : type that we want to type-cast the specified column to 'allow_empty': should we allow empty values (None, str('')) through (Default: False) 'rename' : new name of the column, if None will keep the original name, in case of duplicates suffix will 
be added (Default: None) supported types: 'bool', 'int', 'float', 'str', 'date', 'time', 'datetime' if any of the columns is rejected, entire row is rejected tqdm: progressbar constructor TaskManager: TaskManager constructor (TABLE, TABLE) DESCRIPTION first table contains the rows that were successfully cast to desired types second table contains rows that failed to cast + rejection reason Source code in tablite/core.py def column_select(self, cols: list[ColumnSelectorDict], tqdm=_tqdm, TaskManager=_TaskManager):\n \"\"\"\n type-casts columns from a given table to specified type(s)\n\n cols:\n list of dicts: (example):\n\n cols = [\n {'column':'A', 'type': 'bool'},\n {'column':'B', 'type': 'int', 'allow_empty': True},\n {'column':'B', 'type': 'float', 'allow_empty': False, 'rename': 'C'},\n ]\n\n 'column' : column name of the input table that we want to type-cast\n 'type' : type that we want to type-cast the specified column to\n 'allow_empty': should we allow empty values (None, str('')) through (Default: False)\n 'rename' : new name of the column, if None will keep the original name, in case of duplicates suffix will be added (Default: None)\n\n supported types: 'bool', 'int', 'float', 'str', 'date', 'time', 'datetime'\n\n if any of the columns is rejected, entire row is rejected\n\n tqdm: progressbar constructor\n TaskManager: TaskManager constructor\n\n returns: (Table, Table)\n first table contains the rows that were successfully cast to desired types\n second table contains rows that failed to cast + rejection reason\n \"\"\"\n return _column_select(self, cols, tqdm, TaskManager)\n "},{"location":"reference/core/#tablite.core.Table.join","title":"tablite.core.Table.join(other, left_keys, right_keys, left_columns=None, right_columns=None, kind='inner', merge_keys=False, tqdm=_tqdm, pbar=None) ","text":"short-cut for all join functions. kind: 'inner', 'left', 'outer', 'cross' Source code in tablite/core.py def join(self, other, left_keys, right_keys, left_columns=None, right_columns=None, kind=\"inner\", merge_keys=False, tqdm=_tqdm, pbar=None):\n \"\"\"\n short-cut for all join functions.\n kind: 'inner', 'left', 'outer', 'cross'\n \"\"\"\n kinds = {\n \"inner\": self.inner_join,\n \"left\": self.left_join,\n \"outer\": self.outer_join,\n \"cross\": self.cross_join,\n }\n if kind not in kinds:\n raise ValueError(f\"join type unknown: {kind}\")\n f = kinds.get(kind, None)\n return f(other, left_keys, right_keys, left_columns, right_columns, merge_keys=merge_keys, tqdm=tqdm, pbar=pbar)\n "},{"location":"reference/core/#tablite.core.Table.left_join","title":"tablite.core.Table.left_join(other, left_keys, right_keys, left_columns=None, right_columns=None, merge_keys=False, tqdm=_tqdm, pbar=None) ","text":":param other: self, other = (left, right) :param left_keys: list of keys for the join :param right_keys: list of keys for the join :param left_columns: list of left columns to retain, if None, all are retained. :param right_columns: list of right columns to retain, if None, all are retained. 
:return: new Table Example: SQL: SELECT number, letter FROM numbers LEFT JOIN letters ON numbers.colour == letters.color\nTablite: left_join = numbers.left_join(\n letters, left_keys=['colour'], right_keys=['color'], left_columns=['number'], right_columns=['letter']\n)\n Source code in tablite/core.py def left_join(self, other, left_keys, right_keys, left_columns=None, right_columns=None, merge_keys=False, tqdm=_tqdm, pbar=None):\n \"\"\"\n :param other: self, other = (left, right)\n :param left_keys: list of keys for the join\n :param right_keys: list of keys for the join\n :param left_columns: list of left columns to retain, if None, all are retained.\n :param right_columns: list of right columns to retain, if None, all are retained.\n :return: new Table\n Example:\n ```\n SQL: SELECT number, letter FROM numbers LEFT JOIN letters ON numbers.colour == letters.color\n Tablite: left_join = numbers.left_join(\n letters, left_keys=['colour'], right_keys=['color'], left_columns=['number'], right_columns=['letter']\n )\n ```\n \"\"\"\n return joins.left_join(self, other, left_keys, right_keys, left_columns, right_columns, merge_keys=merge_keys, tqdm=tqdm, pbar=pbar)\n "},{"location":"reference/core/#tablite.core.Table.inner_join","title":"tablite.core.Table.inner_join(other, left_keys, right_keys, left_columns=None, right_columns=None, merge_keys=False, tqdm=_tqdm, pbar=None) ","text":":param other: self, other = (left, right) :param left_keys: list of keys for the join :param right_keys: list of keys for the join :param left_columns: list of left columns to retain, if None, all are retained. :param right_columns: list of right columns to retain, if None, all are retained. :return: new Table Example: SQL: SELECT number, letter FROM numbers JOIN letters ON numbers.colour == letters.color\nTablite: inner_join = numbers.inner_join(\n letters, left_keys=['colour'], right_keys=['color'], left_columns=['number'], right_columns=['letter']\n )\n Source code in tablite/core.py def inner_join(self, other, left_keys, right_keys, left_columns=None, right_columns=None, merge_keys=False, tqdm=_tqdm, pbar=None):\n \"\"\"\n :param other: self, other = (left, right)\n :param left_keys: list of keys for the join\n :param right_keys: list of keys for the join\n :param left_columns: list of left columns to retain, if None, all are retained.\n :param right_columns: list of right columns to retain, if None, all are retained.\n :return: new Table\n Example:\n ```\n SQL: SELECT number, letter FROM numbers JOIN letters ON numbers.colour == letters.color\n Tablite: inner_join = numbers.inner_join(\n letters, left_keys=['colour'], right_keys=['color'], left_columns=['number'], right_columns=['letter']\n )\n ```\n \"\"\"\n return joins.inner_join(self, other, left_keys, right_keys, left_columns, right_columns, merge_keys=merge_keys, tqdm=tqdm, pbar=pbar)\n "},{"location":"reference/core/#tablite.core.Table.outer_join","title":"tablite.core.Table.outer_join(other, left_keys, right_keys, left_columns=None, right_columns=None, merge_keys=False, tqdm=_tqdm, pbar=None) ","text":":param other: self, other = (left, right) :param left_keys: list of keys for the join :param right_keys: list of keys for the join :param left_columns: list of left columns to retain, if None, all are retained. :param right_columns: list of right columns to retain, if None, all are retained. 
:return: new Table Example: SQL: SELECT number, letter FROM numbers OUTER JOIN letters ON numbers.colour == letters.color\nTablite: outer_join = numbers.outer_join(\n letters, left_keys=['colour'], right_keys=['color'], left_columns=['number'], right_columns=['letter']\n )\n Source code in tablite/core.py def outer_join(self, other, left_keys, right_keys, left_columns=None, right_columns=None, merge_keys=False, tqdm=_tqdm, pbar=None):\n \"\"\"\n :param other: self, other = (left, right)\n :param left_keys: list of keys for the join\n :param right_keys: list of keys for the join\n :param left_columns: list of left columns to retain, if None, all are retained.\n :param right_columns: list of right columns to retain, if None, all are retained.\n :return: new Table\n Example:\n ```\n SQL: SELECT number, letter FROM numbers OUTER JOIN letters ON numbers.colour == letters.color\n Tablite: outer_join = numbers.outer_join(\n letters, left_keys=['colour'], right_keys=['color'], left_columns=['number'], right_columns=['letter']\n )\n ```\n \"\"\"\n return joins.outer_join(self, other, left_keys, right_keys, left_columns, right_columns, merge_keys=merge_keys, tqdm=tqdm, pbar=pbar)\n "},{"location":"reference/core/#tablite.core.Table.cross_join","title":"tablite.core.Table.cross_join(other, left_keys, right_keys, left_columns=None, right_columns=None, merge_keys=False, tqdm=_tqdm, pbar=None) ","text":"CROSS JOIN returns the Cartesian product of rows from tables in the join. In other words, it will produce rows which combine each row from the first table with each row from the second table Source code in tablite/core.py def cross_join(self, other, left_keys, right_keys, left_columns=None, right_columns=None, merge_keys=False, tqdm=_tqdm, pbar=None):\n \"\"\"\n CROSS JOIN returns the Cartesian product of rows from tables in the join.\n In other words, it will produce rows which combine each row from the first table\n with each row from the second table\n \"\"\"\n return joins.cross_join(self, other, left_keys, right_keys, left_columns, right_columns, merge_keys=merge_keys, tqdm=tqdm, pbar=pbar)\n "},{"location":"reference/core/#tablite.core.Table.lookup","title":"tablite.core.Table.lookup(other, *criteria, all=True, tqdm=_tqdm) ","text":"function for looking up values in other according to criteria in ascending order. :param: other: Table sorted in ascending search order. :param: criteria: Each criteria must be a tuple with value comparisons in the form: (LEFT, OPERATOR, RIGHT) :param: all: boolean: True=ALL, False=Any OPERATOR must be a callable that returns a boolean LEFT must be a value that the OPERATOR can compare. RIGHT must be a value that the OPERATOR can compare. 
Examples: ('column A', \"==\", 'column B') # comparison of two columns\n('Date', \"<\", DataTypes.date(24,12) ) # value from column 'Date' is before 24/12.\nf = lambda L,R: all( ord(L) < ord(R) ) # uses custom function.\n('text 1', f, 'text 2') value from column 'text 1' is compared with value from column 'text 2'\n Source code in tablite/core.py def lookup(self, other, *criteria, all=True, tqdm=_tqdm):\n \"\"\"function for looking up values in `other` according to criteria in ascending order.\n :param: other: Table sorted in ascending search order.\n :param: criteria: Each criteria must be a tuple with value comparisons in the form:\n (LEFT, OPERATOR, RIGHT)\n :param: all: boolean: True=ALL, False=Any\n\n OPERATOR must be a callable that returns a boolean\n LEFT must be a value that the OPERATOR can compare.\n RIGHT must be a value that the OPERATOR can compare.\n\n Examples:\n ```\n ('column A', \"==\", 'column B') # comparison of two columns\n ('Date', \"<\", DataTypes.date(24,12) ) # value from column 'Date' is before 24/12.\n f = lambda L,R: all( ord(L) < ord(R) ) # uses custom function.\n ('text 1', f, 'text 2') value from column 'text 1' is compared with value from column 'text 2'\n ```\n \"\"\"\n return lookup.lookup(self, other, *criteria, all=all, tqdm=tqdm)\n "},{"location":"reference/core/#tablite.core.Table.match","title":"tablite.core.Table.match(other, *criteria, keep_left=None, keep_right=None) ","text":"performs inner join where T matches other and removes rows that do not match. :param: T: Table :param: other: Table :param: criteria: Each criteria must be a tuple with value comparisons in the form: (LEFT, OPERATOR, RIGHT), where operator must be \"==\"\n\nExample:\n ('column A', \"==\", 'column B')\n\nThis syntax follows the lookup syntax. See Lookup for details.\n :param: keep_left: list of columns to keep. :param: keep_right: list of right columns to keep. Source code in tablite/core.py def match(self, other, *criteria, keep_left=None, keep_right=None):\n \"\"\"\n performs inner join where `T` matches `other` and removes rows that do not match.\n\n :param: T: Table\n :param: other: Table\n :param: criteria: Each criteria must be a tuple with value comparisons in the form:\n\n (LEFT, OPERATOR, RIGHT), where operator must be \"==\"\n\n Example:\n ('column A', \"==\", 'column B')\n\n This syntax follows the lookup syntax. See Lookup for details.\n\n :param: keep_left: list of columns to keep.\n :param: keep_right: list of right columns to keep.\n \"\"\"\n return match.match(self, other, *criteria, keep_left=keep_left, keep_right=keep_right)\n "},{"location":"reference/core/#tablite.core.Table.replace_missing_values","title":"tablite.core.Table.replace_missing_values(*args, **kwargs) ","text":"Source code in tablite/core.py def replace_missing_values(self, *args, **kwargs):\n raise AttributeError(\"See imputation\")\n "},{"location":"reference/core/#tablite.core.Table.imputation","title":"tablite.core.Table.imputation(targets, missing=None, method='carry forward', sources=None, tqdm=_tqdm) ","text":"In statistics, imputation is the process of replacing missing data with substituted values. See more: https://en.wikipedia.org/wiki/Imputation_(statistics) PARAMETER DESCRIPTION table source table. TYPE: Table targets column names to find and replace missing values TYPE: str or list of strings missing values to be replaced. TYPE: None or iterable DEFAULT: None method method to be used for replacement. 
Options: 'carry forward': takes the previous value, and carries forward into fields where values are missing. +: quick. Realistic on time series. -: Can produce strange outliers. 'mean': calculates the column mean (exclude missing ) and copies the mean in as replacement. +: quick -: doesn't work on text. Causes data set to drift towards the mean. 'mode': calculates the column mode (exclude missing ) and copies the mode in as replacement. +: quick -: most frequent value becomes over-represented in the sample 'nearest neighbour': calculates normalised distance between items in source columns, selects nearest neighbour and copies value as replacement. +: works for any datatype. -: computationally intensive (e.g. slow) TYPE: str DEFAULT: 'carry forward' sources NEAREST NEIGHBOUR ONLY column names to be used during imputation. if None or empty, all columns will be used. TYPE: list of strings DEFAULT: None RETURNS DESCRIPTION table table with replaced values. Source code in tablite/core.py def imputation(self, targets, missing=None, method=\"carry forward\", sources=None, tqdm=_tqdm):\n \"\"\"\n In statistics, imputation is the process of replacing missing data with substituted values.\n\n See more: https://en.wikipedia.org/wiki/Imputation_(statistics)\n\n Args:\n table (Table): source table.\n\n targets (str or list of strings): column names to find and\n replace missing values\n\n missing (None or iterable): values to be replaced.\n\n method (str): method to be used for replacement. Options:\n\n 'carry forward':\n takes the previous value, and carries forward into fields\n where values are missing.\n +: quick. Realistic on time series.\n -: Can produce strange outliers.\n\n 'mean':\n calculates the column mean (exclude `missing`) and copies\n the mean in as replacement.\n +: quick\n -: doesn't work on text. Causes data set to drift towards the mean.\n\n 'mode':\n calculates the column mode (exclude `missing`) and copies\n the mode in as replacement.\n +: quick\n -: most frequent value becomes over-represented in the sample\n\n 'nearest neighbour':\n calculates normalised distance between items in source columns\n selects nearest neighbour and copies value as replacement.\n +: works for any datatype.\n -: computationally intensive (e.g. slow)\n\n sources (list of strings): NEAREST NEIGHBOUR ONLY\n column names to be used during imputation.\n if None or empty, all columns will be used.\n\n Returns:\n table: table with replaced values.\n \"\"\"\n return imputation.imputation(self, targets, missing, method, sources, tqdm=tqdm)\n "},{"location":"reference/core/#tablite.core.Table.transpose","title":"tablite.core.Table.transpose(tqdm=_tqdm) ","text":"Source code in tablite/core.py def transpose(self, tqdm=_tqdm):\n return pivots.transpose(self, tqdm)\n "},{"location":"reference/core/#tablite.core.Table.pivot_transpose","title":"tablite.core.Table.pivot_transpose(columns, keep=None, column_name='transpose', value_name='value', tqdm=_tqdm) ","text":"Transpose a selection of columns to rows. PARAMETER DESCRIPTION columns column names to transpose TYPE: list of column names keep column names to keep (repeat) TYPE: list of column names DEFAULT: None RETURNS DESCRIPTION Table with columns transposed to rows Example keep columns 1, 2 and 3 and transpose the remaining columns, except sum . Input: | col1 | col2 | col3 | sun | mon | tue | ... | sat | sum |\n|------|------|------|-----|-----|-----|-----|-----|------|\n| 1234 | 2345 | 3456 | 456 | 567 | | ... | | 1023 |\n| 1244 | 2445 | 4456 | | 7 | | ... 
| | 7 |\n| ... | | | | | | | | |\n\nt.transpose(keep=[col1, col2, col3], transpose=[sun,mon,tue,wed,thu,fri,sat])`\n\nOutput:\n\n|col1| col2| col3| transpose| value|\n|----|-----|-----|----------|------|\n|1234| 2345| 3456| sun | 456|\n|1234| 2345| 3456| mon | 567|\n|1244| 2445| 4456| mon | 7|\n Source code in tablite/core.py def pivot_transpose(self, columns, keep=None, column_name=\"transpose\", value_name=\"value\", tqdm=_tqdm):\n \"\"\"Transpose a selection of columns to rows.\n\n Args:\n columns (list of column names): column names to transpose\n keep (list of column names): column names to keep (repeat)\n\n Returns:\n Table: with columns transposed to rows\n\n Example:\n keep columns 1, 2 and 3 and transpose the remaining columns, except `sum`.\n\n Input:\n ```\n | col1 | col2 | col3 | sun | mon | tue | ... | sat | sum |\n |------|------|------|-----|-----|-----|-----|-----|------|\n | 1234 | 2345 | 3456 | 456 | 567 | | ... | | 1023 |\n | 1244 | 2445 | 4456 | | 7 | | ... | | 7 |\n | ... | | | | | | | | |\n\n t.transpose(keep=[col1, col2, col3], transpose=[sun,mon,tue,wed,thu,fri,sat])`\n\n Output:\n\n |col1| col2| col3| transpose| value|\n |----|-----|-----|----------|------|\n |1234| 2345| 3456| sun | 456|\n |1234| 2345| 3456| mon | 567|\n |1244| 2445| 4456| mon | 7|\n ```\n \"\"\"\n return pivots.pivot_transpose(self, columns, keep, column_name, value_name, tqdm=tqdm)\n "},{"location":"reference/core/#tablite.core.Table.diff","title":"tablite.core.Table.diff(other, columns=None) ","text":"compares table self with table other PARAMETER DESCRIPTION self Table TYPE: Table other Table TYPE: Table columns list of column names to include in comparison. 
Defaults to None.\n\n Returns:\n Table: diff of self and other with diff in columns 1st and 2nd.\n \"\"\"\n return diff.diff(self, other, columns)\n "},{"location":"reference/core/#tablite.core-functions","title":"Functions","text":""},{"location":"reference/core/#tablite.core-modules","title":"Modules","text":""},{"location":"reference/datasets/","title":"Datasets","text":""},{"location":"reference/datasets/#tablite.datasets","title":"tablite.datasets ","text":""},{"location":"reference/datasets/#tablite.datasets-classes","title":"Classes","text":""},{"location":"reference/datasets/#tablite.datasets-functions","title":"Functions","text":""},{"location":"reference/datasets/#tablite.datasets.synthetic_order_data","title":"tablite.datasets.synthetic_order_data(rows=100000) ","text":"Creates a synthetic dataset for testing that looks like this: (depending on number of rows) +=========+=======+=============+===================+=====+===+=====+====+===+=====+=====+===================+==================+\n| ~ | # | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 |\n| row | int | int | datetime | int |int| int |str |str|mixed|mixed| float | float |\n+---------+-------+-------------+-------------------+-----+---+-----+----+---+-----+-----+-------------------+------------------+\n|0 | 1|1478158906743|2021-10-27 00:00:00|50764| 1|29990|C4-5|APP|21\u00b0 |None | 2.0434376837650046|1.3371665497020444|\n|1 | 2|2271295805011|2021-09-13 00:00:00|50141| 0|10212|C4-5|TAE|None |None | 1.010318612835485| 20.94821610676901|\n|2 | 3|1598726492913|2021-08-19 00:00:00|50527| 0|19416|C3-5|QPV|21\u00b0 |None | 1.463459515469516| 17.4133659842749|\n|3 | 4|1413615572689|2021-11-05 00:00:00|50181| 1|18637|C4-2|GCL|6\u00b0 |ABC | 2.084002469706324| 0.489481411683505|\n|4 | 5| 245266998048|2021-09-25 00:00:00|50378| 0|29756|C5-4|LGY|6\u00b0 |XYZ | 0.5141579343276079| 8.550780816571438|\n|5 | 6| 947994853644|2021-10-14 00:00:00|50511| 0| 7890|C2-4|BET|0\u00b0 |XYZ | 1.1725893606177542| 7.447314130260951|\n|6 | 7|2230693047809|2021-10-07 00:00:00|50987| 1|26742|C1-3|CFP|0\u00b0 |XYZ | 1.0921267279498004|11.009210185311993|\n|... |... |... |... |... |...|... |... |...|... |... |... |... |\n|7,999,993|7999994|2047223556745|2021-09-03 00:00:00|50883| 1|15687|C3-1|RFR|None |XYZ | 1.3467185981566827|17.023443485654845|\n|7,999,994|7999995|1814140654790|2021-08-02 00:00:00|50152| 0|16556|C4-2|WTC|None |ABC | 1.1517593924478968| 8.201818634721487|\n|7,999,995|7999996| 155308171103|2021-10-14 00:00:00|50008| 1|14590|C1-3|WYM|0\u00b0 |None | 2.1273836233717978|23.295943554889195|\n|7,999,996|7999997|1620451532911|2021-12-12 00:00:00|50173| 1|20744|C2-1|ZYO|6\u00b0 |ABC | 2.482509134693724| 22.25375464857266|\n|7,999,997|7999998|1248987682094|2021-12-20 00:00:00|50052| 1|28298|C5-4|XAW|None |XYZ |0.17923757926558143|23.728160892974252|\n|7,999,998|7999999|1382206732187|2021-11-13 00:00:00|50993| 1|24832|C5-2|UDL|None |ABC |0.08425329763360942|12.707735293126758|\n|7,999,999|8000000| 600688069780|2021-09-28 00:00:00|50510| 0|15819|C3-4|IGY|None |ABC | 1.066241687256579|13.862069804070295|\n+=========+=======+=============+===================+=====+===+=====+====+===+=====+=====+===================+==================+\n PARAMETER DESCRIPTION rows number of rows wanted. Defaults to 100_000. TYPE: int DEFAULT: 100000 RETURNS DESCRIPTION Table Populated table. 
TYPE: Table Source code in tablite/datasets.py def synthetic_order_data(rows=100_000):\n \"\"\"Creates a synthetic dataset for testing that looks like this:\n (depending on number of rows)\n\n ```\n +=========+=======+=============+===================+=====+===+=====+====+===+=====+=====+===================+==================+\n | ~ | # | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 |\n | row | int | int | datetime | int |int| int |str |str|mixed|mixed| float | float |\n +---------+-------+-------------+-------------------+-----+---+-----+----+---+-----+-----+-------------------+------------------+\n |0 | 1|1478158906743|2021-10-27 00:00:00|50764| 1|29990|C4-5|APP|21\u00b0 |None | 2.0434376837650046|1.3371665497020444|\n |1 | 2|2271295805011|2021-09-13 00:00:00|50141| 0|10212|C4-5|TAE|None |None | 1.010318612835485| 20.94821610676901|\n |2 | 3|1598726492913|2021-08-19 00:00:00|50527| 0|19416|C3-5|QPV|21\u00b0 |None | 1.463459515469516| 17.4133659842749|\n |3 | 4|1413615572689|2021-11-05 00:00:00|50181| 1|18637|C4-2|GCL|6\u00b0 |ABC | 2.084002469706324| 0.489481411683505|\n |4 | 5| 245266998048|2021-09-25 00:00:00|50378| 0|29756|C5-4|LGY|6\u00b0 |XYZ | 0.5141579343276079| 8.550780816571438|\n |5 | 6| 947994853644|2021-10-14 00:00:00|50511| 0| 7890|C2-4|BET|0\u00b0 |XYZ | 1.1725893606177542| 7.447314130260951|\n |6 | 7|2230693047809|2021-10-07 00:00:00|50987| 1|26742|C1-3|CFP|0\u00b0 |XYZ | 1.0921267279498004|11.009210185311993|\n |... |... |... |... |... |...|... |... |...|... |... |... |... |\n |7,999,993|7999994|2047223556745|2021-09-03 00:00:00|50883| 1|15687|C3-1|RFR|None |XYZ | 1.3467185981566827|17.023443485654845|\n |7,999,994|7999995|1814140654790|2021-08-02 00:00:00|50152| 0|16556|C4-2|WTC|None |ABC | 1.1517593924478968| 8.201818634721487|\n |7,999,995|7999996| 155308171103|2021-10-14 00:00:00|50008| 1|14590|C1-3|WYM|0\u00b0 |None | 2.1273836233717978|23.295943554889195|\n |7,999,996|7999997|1620451532911|2021-12-12 00:00:00|50173| 1|20744|C2-1|ZYO|6\u00b0 |ABC | 2.482509134693724| 22.25375464857266|\n |7,999,997|7999998|1248987682094|2021-12-20 00:00:00|50052| 1|28298|C5-4|XAW|None |XYZ |0.17923757926558143|23.728160892974252|\n |7,999,998|7999999|1382206732187|2021-11-13 00:00:00|50993| 1|24832|C5-2|UDL|None |ABC |0.08425329763360942|12.707735293126758|\n |7,999,999|8000000| 600688069780|2021-09-28 00:00:00|50510| 0|15819|C3-4|IGY|None |ABC | 1.066241687256579|13.862069804070295|\n +=========+=======+=============+===================+=====+===+=====+====+===+=====+=====+===================+==================+\n ```\n\n Args:\n rows (int, optional): number of rows wanted. 
Defaults to 100_000.\n\n Returns:\n Table (Table): Populated table.\n \"\"\" # noqa\n rows = int(rows)\n\n L1 = [\"None\", \"0\u00b0\", \"6\u00b0\", \"21\u00b0\"]\n L2 = [\"ABC\", \"XYZ\", \"\"]\n\n t = Table()\n assert isinstance(t, Table)\n for page_n in range(math.ceil(rows / Config.PAGE_SIZE)): # n pages\n start = (page_n * Config.PAGE_SIZE)\n end = min(start + Config.PAGE_SIZE, rows)\n ro = range(start, end)\n\n t2 = Table()\n t2[\"#\"] = [v+1 for v in ro]\n # 1 - mock orderid\n t2[\"1\"] = [random.randint(18_778_628_504, 2277_772_117_504) for i in ro]\n # 2 - mock delivery date.\n t2[\"2\"] = [datetime.fromordinal(random.randint(738000, 738150)).isoformat() for i in ro]\n # 3 - mock store id.\n t2[\"3\"] = [random.randint(50000, 51000) for _ in ro]\n # 4 - random bit.\n t2[\"4\"] = [random.randint(0, 1) for _ in ro]\n # 5 - mock product id\n t2[\"5\"] = [random.randint(3000, 30000) for _ in ro]\n # 6 - random weird string\n t2[\"6\"] = [f\"C{random.randint(1, 5)}-{random.randint(1, 5)}\" for _ in ro]\n # 7 - # random category\n t2[\"7\"] = [\"\".join(random.choice(ascii_uppercase) for _ in range(3)) for _ in ro]\n # 8 -random temperature group.\n t2[\"8\"] = [random.choice(L1) for _ in ro]\n # 9 - random choice of category\n t2[\"9\"] = [random.choice(L2) for _ in ro]\n # 10 - volume?\n t2[\"10\"] = [random.uniform(0.01, 2.5) for _ in ro]\n # 11 - units?\n t2[\"11\"] = [f\"{random.uniform(0.1, 25)}\" for _ in ro]\n\n if len(t) == 0:\n t = t2\n else:\n t += t2\n\n return t\n "},{"location":"reference/datatypes/","title":"Datatypes","text":""},{"location":"reference/datatypes/#tablite.datatypes","title":"tablite.datatypes ","text":""},{"location":"reference/datatypes/#tablite.datatypes-attributes","title":"Attributes","text":""},{"location":"reference/datatypes/#tablite.datatypes.matched_types","title":"tablite.datatypes.matched_types = {int: DataTypes._infer_int, str: DataTypes._infer_str, float: DataTypes._infer_float, bool: DataTypes._infer_bool, date: DataTypes._infer_date, datetime: DataTypes._infer_datetime, time: DataTypes._infer_time} module-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes-classes","title":"Classes","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes","title":"tablite.datatypes.DataTypes ","text":" Bases: object DataTypes is the conversion library for all datatypes. It supports any / all python datatypes. 
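A minimal sketch of the byte/type-code round trip exposed by the functions listed further down this page (`to_bytes`, `type_code`, `from_type_code`); that `from_type_code` accepts the ISO string produced by `to_bytes` is an assumption based on the quoted source:

```python
from datetime import datetime
from tablite.datatypes import DataTypes

v = datetime(2023, 1, 1, 12, 30)
b = DataTypes.to_bytes(v)      # b'2023-01-01T12:30:00' via b_datetime
code = DataTypes.type_code(v)  # 7 for datetime, per pytype_from_type_code below
v2 = DataTypes.from_type_code(b.decode("utf-8"), code)  # assumption: parses ISO text
```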
"},{"location":"reference/datatypes/#tablite.datatypes.DataTypes-attributes","title":"Attributes","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.int","title":"tablite.datatypes.DataTypes.int = int class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.str","title":"tablite.datatypes.DataTypes.str = str class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.float","title":"tablite.datatypes.DataTypes.float = float class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.bool","title":"tablite.datatypes.DataTypes.bool = bool class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.date","title":"tablite.datatypes.DataTypes.date = date class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.datetime","title":"tablite.datatypes.DataTypes.datetime = datetime class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.time","title":"tablite.datatypes.DataTypes.time = time class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.timedelta","title":"tablite.datatypes.DataTypes.timedelta = timedelta class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.numeric_types","title":"tablite.datatypes.DataTypes.numeric_types = {int, float, date, time, datetime} class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.epoch","title":"tablite.datatypes.DataTypes.epoch = datetime(2000, 1, 1, 0, 0, 0, 0, timezone.utc) class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.epoch_no_tz","title":"tablite.datatypes.DataTypes.epoch_no_tz = datetime(2000, 1, 1, 0, 0, 0, 0) class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.digits","title":"tablite.datatypes.DataTypes.digits = '1234567890' class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.decimals","title":"tablite.datatypes.DataTypes.decimals = set('1234567890-+eE.') class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.integers","title":"tablite.datatypes.DataTypes.integers = set('1234567890-+') class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.nones","title":"tablite.datatypes.DataTypes.nones = {'null', 'Null', 'NULL', '#N/A', '#n/a', '', 'None', None, np.nan} class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.none_type","title":"tablite.datatypes.DataTypes.none_type = type(None) class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.bytes_functions","title":"tablite.datatypes.DataTypes.bytes_functions = {type(None): b_none, bool: b_bool, int: b_int, float: b_float, str: b_str, bytes: b_bytes, datetime: b_datetime, date: b_date, time: b_time, timedelta: b_timedelta} class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.type_code_functions","title":"tablite.datatypes.DataTypes.type_code_functions = {1: _none, 2: 
_bool, 3: _int, 4: _float, 5: _str, 6: _bytes, 7: _datetime, 8: _date, 9: _time, 10: _timedelta, 11: _unpickle} class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.pytype_from_type_code","title":"tablite.datatypes.DataTypes.pytype_from_type_code = {1: type(None), 2: bool, 3: int, 4: float, 5: str, 6: bytes, 7: datetime, 8: date, 9: time, 10: timedelta, 11: 'pickled object'} class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.date_formats","title":"tablite.datatypes.DataTypes.date_formats = {'NNNN-NN-NN': lambda x: date(*int(i) for i in x.split('-')), 'NNNN-N-NN': lambda x: date(*int(i) for i in x.split('-')), 'NNNN-NN-N': lambda x: date(*int(i) for i in x.split('-')), 'NNNN-N-N': lambda x: date(*int(i) for i in x.split('-')), 'NN-NN-NNNN': lambda x: date(*[int(i) for i in x.split('-')][::-1]), 'N-NN-NNNN': lambda x: date(*[int(i) for i in x.split('-')][::-1]), 'NN-N-NNNN': lambda x: date(*[int(i) for i in x.split('-')][::-1]), 'N-N-NNNN': lambda x: date(*[int(i) for i in x.split('-')][::-1]), 'NNNN.NN.NN': lambda x: date(*int(i) for i in x.split('.')), 'NNNN.N.NN': lambda x: date(*int(i) for i in x.split('.')), 'NNNN.NN.N': lambda x: date(*int(i) for i in x.split('.')), 'NNNN.N.N': lambda x: date(*int(i) for i in x.split('.')), 'NN.NN.NNNN': lambda x: date(*[int(i) for i in x.split('.')][::-1]), 'N.NN.NNNN': lambda x: date(*[int(i) for i in x.split('.')][::-1]), 'NN.N.NNNN': lambda x: date(*[int(i) for i in x.split('.')][::-1]), 'N.N.NNNN': lambda x: date(*[int(i) for i in x.split('.')][::-1]), 'NNNN/NN/NN': lambda x: date(*int(i) for i in x.split('/')), 'NNNN/N/NN': lambda x: date(*int(i) for i in x.split('/')), 'NNNN/NN/N': lambda x: date(*int(i) for i in x.split('/')), 'NNNN/N/N': lambda x: date(*int(i) for i in x.split('/')), 'NN/NN/NNNN': lambda x: date(*[int(i) for i in x.split('/')][::-1]), 'N/NN/NNNN': lambda x: date(*[int(i) for i in x.split('/')][::-1]), 'NN/N/NNNN': lambda x: date(*[int(i) for i in x.split('/')][::-1]), 'N/N/NNNN': lambda x: date(*[int(i) for i in x.split('/')][::-1]), 'NNNN NN NN': lambda x: date(*int(i) for i in x.split(' ')), 'NNNN N NN': lambda x: date(*int(i) for i in x.split(' ')), 'NNNN NN N': lambda x: date(*int(i) for i in x.split(' ')), 'NNNN N N': lambda x: date(*int(i) for i in x.split(' ')), 'NN NN NNNN': lambda x: date(*[int(i) for i in x.split(' ')][::-1]), 'N N NNNN': lambda x: date(*[int(i) for i in x.split(' ')][::-1]), 'NN N NNNN': lambda x: date(*[int(i) for i in x.split(' ')][::-1]), 'N NN NNNN': lambda x: date(*[int(i) for i in x.split(' ')][::-1]), 'NNNNNNNN': lambda x: date(*(int(x[:4]), int(x[4:6]), int(x[6:])))} class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.datetime_formats","title":"tablite.datatypes.DataTypes.datetime_formats = {'NNNN-NN-NNTNN:NN:NN': lambda x: DataTypes.pattern_to_datetime(x), 'NNNN-NN-NNTNN:NN': lambda x: DataTypes.pattern_to_datetime(x), 'NNNN-NN-NN NN:NN:NN': lambda x: DataTypes.pattern_to_datetime(x, T=' '), 'NNNN-NN-NN NN:NN': lambda x: DataTypes.pattern_to_datetime(x, T=' '), 'NNNN/NN/NNTNN:NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='/'), 'NNNN/NN/NNTNN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='/'), 'NNNN/NN/NN NN:NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='/', T=' '), 'NNNN/NN/NN NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='/', T=' '), 'NNNN NN NNTNN:NN:NN': lambda x: 
DataTypes.pattern_to_datetime(x, ymd=' '), 'NNNN NN NNTNN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd=' '), 'NNNN NN NN NN:NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd=' ', T=' '), 'NNNN NN NN NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd=' ', T=' '), 'NNNN.NN.NNTNN:NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='.'), 'NNNN.NN.NNTNN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='.'), 'NNNN.NN.NN NN:NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='.', T=' '), 'NNNN.NN.NN NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='.', T=' '), 'NN-NN-NNNNTNN:NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='-', T=' ', day_first=True), 'NN-NN-NNNNTNN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='-', T=' ', day_first=True), 'NN-NN-NNNN NN:NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='-', T=' ', day_first=True), 'NN-NN-NNNN NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='-', T=' ', day_first=True), 'NN/NN/NNNNTNN:NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='/', day_first=True), 'NN/NN/NNNNTNN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='/', day_first=True), 'NN/NN/NNNN NN:NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='/', T=' ', day_first=True), 'NN/NN/NNNN NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='/', T=' ', day_first=True), 'NN NN NNNNTNN:NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='/', day_first=True), 'NN NN NNNNTNN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='/', day_first=True), 'NN NN NNNN NN:NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='/', day_first=True), 'NN NN NNNN NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='/', day_first=True), 'NN.NN.NNNNTNN:NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='.', day_first=True), 'NN.NN.NNNNTNN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='.', day_first=True), 'NN.NN.NNNN NN:NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='.', day_first=True), 'NN.NN.NNNN NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='.', day_first=True), 'NNNNNNNNTNNNNNN': lambda x: DataTypes.pattern_to_datetime(x, compact=1), 'NNNNNNNNTNNNN': lambda x: DataTypes.pattern_to_datetime(x, compact=1), 'NNNNNNNNTNN': lambda x: DataTypes.pattern_to_datetime(x, compact=1), 'NNNNNNNNNN': lambda x: DataTypes.pattern_to_datetime(x, compact=2), 'NNNNNNNNNNNN': lambda x: DataTypes.pattern_to_datetime(x, compact=2), 'NNNNNNNNNNNNNN': lambda x: DataTypes.pattern_to_datetime(x, compact=2), 'NNNNNNNNTNN:NN:NN': lambda x: DataTypes.pattern_to_datetime(x, compact=3)} class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.types","title":"tablite.datatypes.DataTypes.types = [datetime, date, time, int, bool, float, str] class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes-functions","title":"Functions","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.type_code","title":"tablite.datatypes.DataTypes.type_code(value) classmethod ","text":"Source code in tablite/datatypes.py @classmethod\ndef type_code(cls, value):\n if type(value) in cls._type_codes:\n return cls._type_codes[type(value)]\n elif hasattr(value, \"dtype\"):\n dtype = pytype(value)\n return cls._type_codes[dtype]\n else:\n return cls._type_codes[\"pickle\"]\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.b_none","title":"tablite.datatypes.DataTypes.b_none(v) ","text":"Source code in tablite/datatypes.py def 
b_none(v):\n return b\"None\"\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.b_bool","title":"tablite.datatypes.DataTypes.b_bool(v) ","text":"Source code in tablite/datatypes.py def b_bool(v):\n return bytes(str(v), encoding=\"utf-8\")\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.b_int","title":"tablite.datatypes.DataTypes.b_int(v) ","text":"Source code in tablite/datatypes.py def b_int(v):\n return bytes(str(v), encoding=\"utf-8\")\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.b_float","title":"tablite.datatypes.DataTypes.b_float(v) ","text":"Source code in tablite/datatypes.py def b_float(v):\n return bytes(str(v), encoding=\"utf-8\")\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.b_str","title":"tablite.datatypes.DataTypes.b_str(v) ","text":"Source code in tablite/datatypes.py def b_str(v):\n return v.encode(\"utf-8\")\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.b_bytes","title":"tablite.datatypes.DataTypes.b_bytes(v) ","text":"Source code in tablite/datatypes.py def b_bytes(v):\n return v\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.b_datetime","title":"tablite.datatypes.DataTypes.b_datetime(v) ","text":"Source code in tablite/datatypes.py def b_datetime(v):\n return bytes(v.isoformat(), encoding=\"utf-8\")\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.b_date","title":"tablite.datatypes.DataTypes.b_date(v) ","text":"Source code in tablite/datatypes.py def b_date(v):\n return bytes(v.isoformat(), encoding=\"utf-8\")\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.b_time","title":"tablite.datatypes.DataTypes.b_time(v) ","text":"Source code in tablite/datatypes.py def b_time(v):\n return bytes(v.isoformat(), encoding=\"utf-8\")\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.b_timedelta","title":"tablite.datatypes.DataTypes.b_timedelta(v) ","text":"Source code in tablite/datatypes.py def b_timedelta(v):\n return bytes(str(float(v.days + (v.seconds / (24 * 60 * 60)))), \"utf-8\")\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.b_pickle","title":"tablite.datatypes.DataTypes.b_pickle(v) ","text":"Source code in tablite/datatypes.py def b_pickle(v):\n return pickle.dumps(v, protocol=0)\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.to_bytes","title":"tablite.datatypes.DataTypes.to_bytes(v) classmethod ","text":"Source code in tablite/datatypes.py @classmethod\ndef to_bytes(cls, v):\n if type(v) in cls.bytes_functions: # it's a python native type\n f = cls.bytes_functions[type(v)]\n elif hasattr(v, \"dtype\"): # it's a numpy/c type.\n dtype = pytype(v)\n f = cls.bytes_functions[dtype]\n else:\n f = cls.b_pickle\n return f(v)\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.from_type_code","title":"tablite.datatypes.DataTypes.from_type_code(value, code) classmethod ","text":"Source code in tablite/datatypes.py @classmethod\ndef from_type_code(cls, value, code):\n f = cls.type_code_functions[code]\n return f(value)\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.pattern_to_datetime","title":"tablite.datatypes.DataTypes.pattern_to_datetime(iso_string, ymd=None, T=None, compact=0, day_first=False) staticmethod ","text":"Source code in tablite/datatypes.py @staticmethod\ndef pattern_to_datetime(iso_string, ymd=None, T=None, compact=0, day_first=False):\n assert isinstance(iso_string, str)\n if compact:\n s = iso_string\n if compact == 1: # 
has T\n            slices = [\n                (0, 4, \"-\"),\n                (4, 6, \"-\"),\n                (6, 8, \"T\"),\n                (9, 11, \":\"),\n                (11, 13, \":\"),\n                (13, len(s), \"\"),\n            ]\n        elif compact == 2:  # has no T.\n            slices = [\n                (0, 4, \"-\"),\n                (4, 6, \"-\"),\n                (6, 8, \"T\"),\n                (8, 10, \":\"),\n                (10, 12, \":\"),\n                (12, len(s), \"\"),\n            ]\n        elif compact == 3:  # has T and :\n            slices = [\n                (0, 4, \"-\"),\n                (4, 6, \"-\"),\n                (6, 8, \"T\"),\n                (9, 11, \":\"),\n                (12, 14, \":\"),\n                (15, len(s), \"\"),\n            ]\n        else:\n            raise TypeError\n        iso_string = \"\".join([s[a:b] + c for a, b, c in slices if b <= len(s)])\n        iso_string = iso_string.rstrip(\":\")\n\n    if day_first:\n        s = iso_string\n        iso_string = \"\".join((s[6:10], \"-\", s[3:5], \"-\", s[0:2], s[10:]))\n\n    if \",\" in iso_string:\n        iso_string = iso_string.replace(\",\", \".\")\n\n    dot = iso_string[::-1].find(\".\")\n    if 0 < dot < 10:\n        ix = len(iso_string) - dot\n        microsecond = int(float(f\"0{iso_string[ix - 1:]}\") * 10**6)\n        # fmt:off\n        iso_string = iso_string[: len(iso_string) - dot] + str(microsecond).rjust(6, \"0\")\n        # fmt:on\n    if ymd:\n        iso_string = iso_string.replace(ymd, \"-\", 2)\n    if T:\n        iso_string = iso_string.replace(T, \"T\")\n    return datetime.fromisoformat(iso_string)\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.round","title":"tablite.datatypes.DataTypes.round(value, multiple, up=None) classmethod ","text":"a nicer way to round numbers. PARAMETER DESCRIPTION value value to be rounded TYPE: (float, integer, datetime) multiple value to be used as the base of rounding. 1) multiple = 1 is the same as rounding to whole integers. 2) multiple = 0.001 is the same as rounding to 3 digits precision. 3) multiple = 3.1415 is rounding to the nearest multiple of 3.1415 4) value = datetime(2022,8,18,11,14,53,440) 5) multiple = timedelta(hours=0.5) 6) round(value,multiple) is datetime(2022,8,18,11,0) TYPE: (float, integer, timedelta) up None (default) or boolean rounds half, up or down. round(1.6, 1) rounds to 2. round(1.4, 1) rounds to 1. round(1.5, 1, up=True) rounds to 2. round(1.5, 1, up=False) rounds to 1. TYPE: (None, bool) DEFAULT: None RETURNS DESCRIPTION float,integer,datetime: rounded value in same type as input. 
Source code in tablite/datatypes.py @classmethod\ndef round(cls, value, multiple, up=None):\n    \"\"\"a nicer way to round numbers.\n\n    Args:\n        value (float,integer,datetime): value to be rounded\n\n        multiple (float,integer,timedelta): value to be used as the base of rounding.\n        1) multiple = 1 is the same as rounding to whole integers.\n        2) multiple = 0.001 is the same as rounding to 3 digits precision.\n        3) multiple = 3.1415 is rounding to the nearest multiple of 3.1415\n        4) value = datetime(2022,8,18,11,14,53,440)\n        5) multiple = timedelta(hours=0.5)\n        6) round(value,multiple) is datetime(2022,8,18,11,0)\n\n        up (None, bool, optional):\n        None (default) or boolean rounds half, up or down.\n        round(1.6, 1) rounds to 2.\n        round(1.4, 1) rounds to 1.\n        round(1.5, 1, up=True) rounds to 2.\n        round(1.5, 1, up=False) rounds to 1.\n\n    Returns:\n        float,integer,datetime: rounded value in same type as input.\n    \"\"\"\n    epoch = 0\n    if isinstance(value, (datetime)) and isinstance(multiple, timedelta):\n        if value.tzinfo is None:\n            epoch = cls.epoch_no_tz\n        else:\n            epoch = cls.epoch\n\n    value2 = value - epoch\n    if value2 == 0:\n        return value2\n\n    low = (value2 // multiple) * multiple\n    high = low + multiple\n    if up is True:\n        return high + epoch\n    elif up is False:\n        return low + epoch\n    else:\n        if abs((high + epoch) - value) < abs(value - (low + epoch)):\n            return high + epoch\n        else:\n            return low + epoch\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.to_json","title":"tablite.datatypes.DataTypes.to_json(v) staticmethod ","text":"converts any python type to json. PARAMETER DESCRIPTION v value to convert to json TYPE: any RETURNS DESCRIPTION json compatible value from v Source code in tablite/datatypes.py @staticmethod\ndef to_json(v):\n    \"\"\"converts any python type to json.\n\n    Args:\n        v (any): value to convert to json\n\n    Returns:\n        json compatible value from v\n    \"\"\"\n    if hasattr(v, \"dtype\"):\n        v = numpy_to_python(v)\n    if v is None:\n        return v\n    elif v is False:\n        # using isinstance(v, bool): won't work as False also is int of zero.\n        return str(v)\n    elif v is True:\n        return str(v)\n    elif isinstance(v, int):\n        return v\n    elif isinstance(v, str):\n        return v\n    elif isinstance(v, float):\n        return v\n    elif isinstance(v, datetime):\n        return v.isoformat()\n    elif isinstance(v, time):\n        return v.isoformat()\n    elif isinstance(v, date):\n        return v.isoformat()\n    elif isinstance(v, timedelta):\n        return f\"P{v.days}DT{v.seconds + (v.microseconds / 1e6)}S\"\n    else:\n        raise TypeError(f\"The datatype {type(v)} is not supported.\")\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.from_json","title":"tablite.datatypes.DataTypes.from_json(v, dtype) staticmethod ","text":"converts json to python datatype PARAMETER DESCRIPTION v value TYPE: any dtype any python type TYPE: python type RETURNS DESCRIPTION python type of value v Source code in tablite/datatypes.py @staticmethod\ndef from_json(v, dtype):\n    \"\"\"converts json to python datatype\n\n    Args:\n        v (any): value\n        dtype (python type): any python type\n\n    Returns:\n        python type of value v\n    \"\"\"\n    if v in DataTypes.nones:\n        if dtype is str and v == \"\":\n            return \"\"\n        else:\n            return None\n    if dtype is int:\n        return int(v)\n    elif dtype is str:\n        return str(v)\n    elif dtype is float:\n        return float(v)\n    elif dtype is bool:\n        if v == \"False\":\n            return False\n        elif v == \"True\":\n            return True\n        else:\n            raise ValueError(v)\n    elif dtype is date:\n        return date.fromisoformat(v)\n    elif dtype is datetime:\n        return 
datetime.fromisoformat(v)\n    elif dtype is time:\n        return time.fromisoformat(v)\n    elif dtype is timedelta:\n        L = v.split(\"DT\")\n        days = int(L[0].lstrip(\"P\"))\n        seconds = float(L[1].rstrip(\"S\"))\n        return timedelta(days, seconds)\n    else:\n        raise TypeError(f\"The datatype {str(dtype)} is not supported.\")\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.guess_types","title":"tablite.datatypes.DataTypes.guess_types(*values) staticmethod ","text":"Attempts to guess the datatype for *values returns dict with matching datatypes and probabilities RETURNS DESCRIPTION dict {key: type, value: probability} Source code in tablite/datatypes.py @staticmethod\ndef guess_types(*values):\n    \"\"\"Attempts to guess the datatype for *values\n    returns dict with matching datatypes and probabilities\n\n    Returns:\n        dict: {key: type, value: probability}\n    \"\"\"\n    d = defaultdict(int)\n    probability = Rank(*DataTypes.types[:])\n\n    for value in values:\n        if hasattr(value, \"dtype\"):\n            value = numpy_to_python(value)\n\n        for dtype in probability:\n            try:\n                _ = DataTypes.infer(value, dtype)\n                d[dtype] += 1\n                probability.match(dtype)\n                break\n            except (ValueError, TypeError):\n                pass\n    if not d:\n        d[str] = len(values)\n    return {k: round(v / len(values), 3) for k, v in d.items()}\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.guess","title":"tablite.datatypes.DataTypes.guess(*values) staticmethod ","text":"Makes a best guess of the datatype for *values returns list of native python values RETURNS DESCRIPTION list list of native python values Source code in tablite/datatypes.py @staticmethod\ndef guess(*values):\n    \"\"\"Makes a best guess of the datatype for *values\n    returns list of native python values\n\n    Returns:\n        list: list of native python values\n    \"\"\"\n    probability = Rank(*DataTypes.types[:])\n    matches = [None for _ in values[0]]\n\n    for ix, value in enumerate(values[0]):\n        if hasattr(value, \"dtype\"):\n            value = numpy_to_python(value)\n        for dtype in probability:\n            try:\n                matches[ix] = DataTypes.infer(value, dtype)\n                probability.match(dtype)\n                break\n            except (ValueError, TypeError):\n                pass\n    return matches\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.infer","title":"tablite.datatypes.DataTypes.infer(v, dtype) classmethod ","text":"Source code in tablite/datatypes.py @classmethod\ndef infer(cls, v, dtype):\n    if isinstance(v, str) and dtype == str:\n        # we got a string, we're trying to infer it to string, we shouldn't check for None-ness\n        return v\n\n    if v in DataTypes.nones:\n        return None\n\n    if dtype not in matched_types:\n        raise TypeError(f\"The datatype {str(dtype)} is not supported.\")\n\n    return matched_types[dtype](v)\n "},{"location":"reference/datatypes/#tablite.datatypes.Rank","title":"tablite.datatypes.Rank(*items) ","text":" Bases: object Source code in tablite/datatypes.py def __init__(self, *items):\n    self.items = {i: ix for i, ix in zip(items, range(len(items)))}\n    self.ranks = [0 for _ in items]\n    self.items_list = [i for i in items]\n "},{"location":"reference/datatypes/#tablite.datatypes.Rank-attributes","title":"Attributes","text":""},{"location":"reference/datatypes/#tablite.datatypes.Rank.items","title":"tablite.datatypes.Rank.items = {i: ix for (i, ix) in zip(items, range(len(items)))} instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.Rank.ranks","title":"tablite.datatypes.Rank.ranks = [0 for _ in items] instance-attribute 
","text":""},{"location":"reference/datatypes/#tablite.datatypes.Rank.items_list","title":"tablite.datatypes.Rank.items_list = [i for i in items] instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.Rank-functions","title":"Functions","text":""},{"location":"reference/datatypes/#tablite.datatypes.Rank.match","title":"tablite.datatypes.Rank.match(k) ","text":"Source code in tablite/datatypes.py def match(self, k): # k+=1\n ix = self.items[k]\n r = self.ranks\n r[ix] += 1\n\n if ix > 0:\n p = self.items_list\n while (\n r[ix] > r[ix - 1] and ix > 0\n ): # use a simple bubble sort to maintain rank\n r[ix], r[ix - 1] = r[ix - 1], r[ix]\n p[ix], p[ix - 1] = p[ix - 1], p[ix]\n old = p[ix]\n self.items[old] = ix\n self.items[k] = ix - 1\n ix -= 1\n "},{"location":"reference/datatypes/#tablite.datatypes.Rank.__iter__","title":"tablite.datatypes.Rank.__iter__() ","text":"Source code in tablite/datatypes.py def __iter__(self):\n return iter(self.items_list)\n "},{"location":"reference/datatypes/#tablite.datatypes.MetaArray","title":"tablite.datatypes.MetaArray ","text":" Bases: ndarray Array with metadata. "},{"location":"reference/datatypes/#tablite.datatypes.MetaArray-functions","title":"Functions","text":""},{"location":"reference/datatypes/#tablite.datatypes.MetaArray.__new__","title":"tablite.datatypes.MetaArray.__new__(array, dtype=None, order=None, **kwargs) ","text":"Source code in tablite/datatypes.py def __new__(cls, array, dtype=None, order=None, **kwargs):\n obj = np.asarray(array, dtype=dtype, order=order).view(cls)\n obj.metadata = kwargs\n return obj\n "},{"location":"reference/datatypes/#tablite.datatypes.MetaArray.__array_finalize__","title":"tablite.datatypes.MetaArray.__array_finalize__(obj) ","text":"Source code in tablite/datatypes.py def __array_finalize__(self, obj):\n if obj is None:\n return\n self.metadata = getattr(obj, \"metadata\", None)\n "},{"location":"reference/datatypes/#tablite.datatypes-functions","title":"Functions","text":""},{"location":"reference/datatypes/#tablite.datatypes.numpy_to_python","title":"tablite.datatypes.numpy_to_python(obj: Any) -> Any ","text":"Converts numpy types to python types. See https://numpy.org/doc/stable/reference/arrays.scalars.html PARAMETER DESCRIPTION obj A numpy object TYPE: Any RETURNS DESCRIPTION Any python object: A python object Source code in tablite/datatypes.py def numpy_to_python(obj: Any) -> Any:\n \"\"\"Converts numpy types to python types.\n\n See https://numpy.org/doc/stable/reference/arrays.scalars.html\n\n Args:\n obj (Any): A numpy object\n\n Returns:\n python object: A python object\n \"\"\"\n if isinstance(obj, np.generic):\n return obj.item()\n return obj\n "},{"location":"reference/datatypes/#tablite.datatypes.pytype","title":"tablite.datatypes.pytype(obj) ","text":"Returns the python type of any object PARAMETER DESCRIPTION obj any numpy or python object TYPE: Any RETURNS DESCRIPTION type type of obj Source code in tablite/datatypes.py def pytype(obj):\n \"\"\"Returns the python type of any object\n\n Args:\n obj (Any): any numpy or python object\n\n Returns:\n type: type of obj\n \"\"\"\n if isinstance(obj, np.generic):\n return type(obj.item())\n return type(obj)\n "},{"location":"reference/datatypes/#tablite.datatypes.pytype_from_iterable","title":"tablite.datatypes.pytype_from_iterable(iterable: {tuple, list}) -> {np.dtype, dict} ","text":"helper to make correct np array from python types. PARAMETER DESCRIPTION iterable values to be converted to numpy array. 
TYPE: (tuple, list) RAISES DESCRIPTION NotImplementedError if datatype is not supported. RETURNS DESCRIPTION {dtype, dict} np.dtype: python type of the iterable. Source code in tablite/datatypes.py def pytype_from_iterable(iterable: {tuple, list}) -> {np.dtype, dict}:\n    \"\"\"helper to make correct np array from python types.\n\n    Args:\n        iterable (tuple,list): values to be converted to numpy array.\n\n    Raises:\n        NotImplementedError: if datatype is not supported.\n\n    Returns:\n        np.dtype: python type of the iterable.\n    \"\"\"\n    py_types = {}\n    if isinstance(iterable, (tuple, list)):\n        type_counter = Counter((pytype(v) for v in iterable))\n\n        for k, v in type_counter.items():\n            py_types[k] = v\n\n        if len(py_types) == 0:\n            np_dtype, py_dtype = object, bool\n        elif len(py_types) == 1:\n            py_dtype = list(py_types.keys())[0]\n            if py_dtype == datetime:\n                np_dtype = np.datetime64\n            elif py_dtype == date:\n                np_dtype = np.datetime64\n            elif py_dtype == timedelta:\n                np_dtype = np.timedelta64\n            else:\n                np_dtype = None\n        else:\n            np_dtype = object\n    elif isinstance(iterable, np.ndarray):\n        if iterable.dtype == object:\n            np_dtype = object\n            py_types = dict(Counter((pytype(v) for v in iterable)))\n        else:\n            np_dtype = iterable.dtype\n            if len(iterable) > 0:\n                py_types = {pytype(iterable[0]): len(iterable)}\n            else:\n                py_types = {pytype(np_dtype.type()): len(iterable)}\n    else:\n        raise NotImplementedError(f\"No handler for {type(iterable)}\")\n\n    return np_dtype, py_types\n "},{"location":"reference/datatypes/#tablite.datatypes.list_to_np_array","title":"tablite.datatypes.list_to_np_array(iterable) ","text":"helper to make correct np array from python types. Example of problem where numpy turns mixed types into strings. np.array([4, '5']) np.ndarray(['4', '5']) RETURNS DESCRIPTION np.array datatypes Source code in tablite/datatypes.py def list_to_np_array(iterable):\n    \"\"\"helper to make correct np array from python types.\n    Example of problem where numpy turns mixed types into strings.\n    >>> np.array([4, '5'])\n    np.ndarray(['4', '5'])\n\n    returns:\n        np.array\n        datatypes\n    \"\"\"\n    np_dtype, py_dtype = pytype_from_iterable(iterable)\n\n    value = MetaArray(iterable, dtype=np_dtype, py_dtype=py_dtype)\n    return value\n "},{"location":"reference/datatypes/#tablite.datatypes.np_type_unify","title":"tablite.datatypes.np_type_unify(arrays) ","text":"unifies numpy types. PARAMETER DESCRIPTION arrays List of numpy arrays TYPE: list RETURNS DESCRIPTION np.ndarray: numpy array of a single type. Source code in tablite/datatypes.py def np_type_unify(arrays):\n    \"\"\"unifies numpy types.\n\n    Args:\n        arrays (list): List of numpy arrays\n\n    Returns:\n        np.ndarray: numpy array of a single type.\n    \"\"\"\n    dtypes = {arr.dtype: len(arr) for arr in arrays}\n    if len(dtypes) == 1:\n        dtype, _ = dtypes.popitem()\n    else:\n        for ix, arr in enumerate(arrays):\n            arrays[ix] = np.array(arr, dtype=object)\n        dtype = object\n    return np.concatenate(arrays, dtype=dtype)\n "},{"location":"reference/datatypes/#tablite.datatypes.multitype_set","title":"tablite.datatypes.multitype_set(arr) ","text":"prevents loss of True, False when calling sets. python loses values when the iterable is converted to a set. Example: {1, True, 0, False} PARAMETER DESCRIPTION arr iterable of mixed types. TYPE: Iterable RETURNS DESCRIPTION np.array: with unique values. Source code in tablite/datatypes.py def multitype_set(arr):\n    \"\"\"prevents loss of True, False when calling sets.\n\n    python loses values when the iterable is converted to a set. 
Example:\n    >>> {1, True, 0, False}\n    {0,1}\n\n    Args:\n        arr (Iterable): iterable of mixed types.\n\n    Returns:\n        np.array: with unique values.\n    \"\"\"\n    L = [(type(v), v) for v in arr]\n    L = list(set(L))\n    L = [v for _, v in L]\n    return np.array(L, dtype=object)\n "},{"location":"reference/diff/","title":"Diff","text":""},{"location":"reference/diff/#tablite.diff","title":"tablite.diff ","text":""},{"location":"reference/diff/#tablite.diff-classes","title":"Classes","text":""},{"location":"reference/diff/#tablite.diff-functions","title":"Functions","text":""},{"location":"reference/diff/#tablite.diff.diff","title":"tablite.diff.diff(T, other, columns=None) ","text":"compares table self with table other PARAMETER DESCRIPTION self Table TYPE: Table other Table TYPE: Table columns list of column names to include in comparison. Defaults to None. TYPE: List DEFAULT: None RETURNS DESCRIPTION Table diff of self and other with diff in columns 1st and 2nd. Source code in tablite/diff.py def diff(T, other, columns=None):\n    \"\"\"compares table self with table other\n\n    Args:\n        self (Table): Table\n        other (Table): Table\n        columns (List, optional): list of column names to include in comparison. Defaults to None.\n\n    Returns:\n        Table: diff of self and other with diff in columns 1st and 2nd.\n    \"\"\"\n    sub_cls_check(T, BaseTable)\n    sub_cls_check(other, BaseTable)\n    if columns is None:\n        columns = [name for name in T.columns if name in other.columns]\n    elif isinstance(columns, list) and all(isinstance(i, str) for i in columns):\n        for name in columns:\n            if name not in T.columns:\n                raise ValueError(f\"column '{name}' not found\")\n            if name not in other.columns:\n                raise ValueError(f\"column '{name}' not found\")\n    else:\n        raise TypeError(\"Expected list of column names\")\n\n    t1 = T[columns]\n    if issubclass(type(t1), BaseTable):\n        t1 = [tuple(r) for r in T.rows]\n    else:\n        t1 = list(T)\n    t2 = other[columns]\n    if issubclass(type(t2), BaseTable):\n        t2 = [tuple(r) for r in other.rows]\n    else:\n        t2 = list(other)\n\n    sm = difflib.SequenceMatcher(None, t1, t2)\n    new = type(T)()\n    first = unique_name(\"1st\", columns)\n    second = unique_name(\"2nd\", columns)\n    new.add_columns(*columns + [first, second])\n\n    news = {n: [] for n in new.columns}  # Cache for Work in progress.\n\n    for opc, t1a, t1b, t2a, t2b in sm.get_opcodes():\n        if opc == \"insert\":\n            for name, col in zip(columns, zip(*t2[t2a:t2b])):\n                news[name].extend(col)\n            news[first] += [\"-\"] * (t2b - t2a)\n            news[second] += [\"+\"] * (t2b - t2a)\n\n        elif opc == \"delete\":\n            for name, col in zip(columns, zip(*t1[t1a:t1b])):\n                news[name].extend(col)\n            news[first] += [\"+\"] * (t1b - t1a)\n            news[second] += [\"-\"] * (t1b - t1a)\n\n        elif opc == \"equal\":\n            for name, col in zip(columns, zip(*t2[t2a:t2b])):\n                news[name].extend(col)\n            news[first] += [\"=\"] * (t2b - t2a)\n            news[second] += [\"=\"] * (t2b - t2a)\n\n        elif opc == \"replace\":\n            for name, col in zip(columns, zip(*t2[t2a:t2b])):\n                news[name].extend(col)\n            news[first] += [\"r\"] * (t2b - t2a)\n            news[second] += [\"r\"] * (t2b - t2a)\n\n        else:\n            pass\n\n        # Clear cache to free up memory once it exceeds a page.\n        if len(news[first]) > Config.PAGE_SIZE:\n            for name, L in news.items():\n                new[name].extend(np.array(L))\n                L.clear()\n\n    for name, L in news.items():\n        new[name].extend(np.array(L))\n        L.clear()\n    return new\n "},{"location":"reference/export_utils/","title":"Export utils","text":""},{"location":"reference/export_utils/#tablite.export_utils","title":"tablite.export_utils 
","text":""},{"location":"reference/export_utils/#tablite.export_utils-classes","title":"Classes","text":""},{"location":"reference/export_utils/#tablite.export_utils-functions","title":"Functions","text":""},{"location":"reference/export_utils/#tablite.export_utils.to_sql","title":"tablite.export_utils.to_sql(table, name) ","text":"generates ANSI-92 compliant SQL. PARAMETER DESCRIPTION name name of SQL table. TYPE: str Source code in tablite/export_utils.py def to_sql(table, name):\n \"\"\"\n generates ANSI-92 compliant SQL.\n\n args:\n name (str): name of SQL table.\n \"\"\"\n sub_cls_check(table, BaseTable)\n type_check(name, str)\n\n prefix = name\n name = \"T1\"\n create_table = \"\"\"CREATE TABLE {} ({})\"\"\"\n columns = []\n for name, col in table.columns.items():\n dtype = col.types()\n if len(dtype) == 1:\n dtype, _ = dtype.popitem()\n if dtype is int:\n dtype = \"INTEGER\"\n elif dtype is float:\n dtype = \"REAL\"\n else:\n dtype = \"TEXT\"\n else:\n dtype = \"TEXT\"\n definition = f\"{name} {dtype}\"\n columns.append(definition)\n\n create_table = create_table.format(prefix, \", \".join(columns))\n\n # return create_table\n row_inserts = []\n for row in table.rows:\n row_inserts.append(str(tuple([i if i is not None else \"NULL\" for i in row])))\n row_inserts = f\"INSERT INTO {prefix} VALUES \" + \",\".join(row_inserts)\n return \"begin; {}; {}; commit;\".format(create_table, row_inserts)\n "},{"location":"reference/export_utils/#tablite.export_utils.to_pandas","title":"tablite.export_utils.to_pandas(table) ","text":"returns pandas.DataFrame Source code in tablite/export_utils.py def to_pandas(table):\n \"\"\"\n returns pandas.DataFrame\n \"\"\"\n sub_cls_check(table, BaseTable)\n try:\n return pd.DataFrame(table.to_dict()) # noqa\n except ImportError:\n import pandas as pd # noqa\n return pd.DataFrame(table.to_dict()) # noqa\n "},{"location":"reference/export_utils/#tablite.export_utils.to_hdf5","title":"tablite.export_utils.to_hdf5(table, path) ","text":"creates a copy of the table as hdf5 Note that some loss of type information is to be expected in columns of mixed type: t.show(dtype=True) +===+===+=====+=====+====+=====+=====+===================+==========+========+===============+===+=========================+=====+===+ | # | A | B | C | D | E | F | G | H | I | J | K | L | M | O | |row|int|mixed|float|str |mixed| bool| datetime | date | time | timedelta |str| int |float|int| +---+---+-----+-----+----+-----+-----+-------------------+----------+--------+---------------+---+-------------------------+-----+---+ | 0 | -1|None | -1.1| |None |False|2023-06-09 09:12:06|2023-06-09|09:12:06| 1 day, 0:00:00|b |-100000000000000000000000| inf| 11| | 1 | 1| 1| 1.1|1000|1 | True|2023-06-09 09:12:06|2023-06-09|09:12:06|2 days, 0:06:40|\u55e8 | 100000000000000000000000| -inf|-11| +===+===+=====+=====+====+=====+=====+===================+==========+========+===============+===+=========================+=====+===+ t.to_hdf5(filename) t2 = Table.from_hdf5(filename) t2.show(dtype=True) +===+===+=====+=====+=====+=====+=====+===================+===================+========+===============+===+=========================+=====+===+ | # | A | B | C | D | E | F | G | H | I | J | K | L | M | O | |row|int|mixed|float|mixed|mixed| bool| datetime | datetime | time | str |str| int |float|int| +---+---+-----+-----+-----+-----+-----+-------------------+-------------------+--------+---------------+---+-------------------------+-----+---+ | 0 | -1|None | -1.1|None |None |False|2023-06-09 09:12:06|2023-06-09 
00:00:00|09:12:06|1 day, 0:00:00 |b |-100000000000000000000000| inf| 11| | 1 | 1| 1| 1.1| 1000| 1| True|2023-06-09 09:12:06|2023-06-09 00:00:00|09:12:06|2 days, 0:06:40|\u55e8 | 100000000000000000000000| -inf|-11| +===+===+=====+=====+=====+=====+=====+===================+===================+========+===============+===+=========================+=====+===+ Source code in tablite/export_utils.py def to_hdf5(table, path):\n # fmt: off\n \"\"\"\n creates a copy of the table as hdf5\n\n Note that some loss of type information is to be expected in columns of mixed type:\n >>> t.show(dtype=True)\n +===+===+=====+=====+====+=====+=====+===================+==========+========+===============+===+=========================+=====+===+\n | # | A | B | C | D | E | F | G | H | I | J | K | L | M | O |\n |row|int|mixed|float|str |mixed| bool| datetime | date | time | timedelta |str| int |float|int|\n +---+---+-----+-----+----+-----+-----+-------------------+----------+--------+---------------+---+-------------------------+-----+---+\n | 0 | -1|None | -1.1| |None |False|2023-06-09 09:12:06|2023-06-09|09:12:06| 1 day, 0:00:00|b |-100000000000000000000000| inf| 11|\n | 1 | 1| 1| 1.1|1000|1 | True|2023-06-09 09:12:06|2023-06-09|09:12:06|2 days, 0:06:40|\u55e8 | 100000000000000000000000| -inf|-11|\n +===+===+=====+=====+====+=====+=====+===================+==========+========+===============+===+=========================+=====+===+\n >>> t.to_hdf5(filename)\n >>> t2 = Table.from_hdf5(filename)\n >>> t2.show(dtype=True)\n +===+===+=====+=====+=====+=====+=====+===================+===================+========+===============+===+=========================+=====+===+\n | # | A | B | C | D | E | F | G | H | I | J | K | L | M | O |\n |row|int|mixed|float|mixed|mixed| bool| datetime | datetime | time | str |str| int |float|int|\n +---+---+-----+-----+-----+-----+-----+-------------------+-------------------+--------+---------------+---+-------------------------+-----+---+\n | 0 | -1|None | -1.1|None |None |False|2023-06-09 09:12:06|2023-06-09 00:00:00|09:12:06|1 day, 0:00:00 |b |-100000000000000000000000| inf| 11|\n | 1 | 1| 1| 1.1| 1000| 1| True|2023-06-09 09:12:06|2023-06-09 00:00:00|09:12:06|2 days, 0:06:40|\u55e8 | 100000000000000000000000| -inf|-11|\n +===+===+=====+=====+=====+=====+=====+===================+===================+========+===============+===+=========================+=====+===+\n \"\"\"\n # fmt: in\n import h5py\n\n sub_cls_check(table, BaseTable)\n type_check(path, Path)\n\n total = f\"{len(table.columns) * len(table):,}\" # noqa\n print(f\"writing {total} records to {path}\", end=\"\")\n\n with h5py.File(path, \"w\") as f:\n n = 0\n for name, col in table.items():\n try:\n f.create_dataset(name, data=col[:]) # stored in hdf5 as '/name'\n except TypeError:\n f.create_dataset(name, data=[str(i) for i in col[:]]) # stored in hdf5 as '/name'\n n += 1\n print(\"... done\")\n "},{"location":"reference/export_utils/#tablite.export_utils.excel_writer","title":"tablite.export_utils.excel_writer(table, path) ","text":"writer for excel files. This can create xlsx files beyond Excels. If you're using pyexcel to read the data, you'll see the data is there. If you're using Excel, Excel will stop loading after 1,048,576 rows. 
See pyexcel for more details: http://docs.pyexcel.org/ Source code in tablite/export_utils.py def excel_writer(table, path):\n    \"\"\"\n    writer for excel files.\n\n    This can create xlsx files beyond Excels.\n    If you're using pyexcel to read the data, you'll see the data is there.\n    If you're using Excel, Excel will stop loading after 1,048,576 rows.\n\n    See pyexcel for more details:\n    http://docs.pyexcel.org/\n    \"\"\"\n    import pyexcel\n\n    sub_cls_check(table, BaseTable)\n    type_check(path, Path)\n\n    def gen(table):  # local helper\n        yield table.columns\n        for row in table.rows:\n            yield row\n\n    data = list(gen(table))\n    if path.suffix in [\".xls\", \".ods\"]:\n        data = [\n            [str(v) if (isinstance(v, (int, float)) and abs(v) > 2**32 - 1) else DataTypes.to_json(v) for v in row]\n            for row in data\n        ]\n\n    pyexcel.save_as(array=data, dest_file_name=str(path))\n "},{"location":"reference/export_utils/#tablite.export_utils.to_json","title":"tablite.export_utils.to_json(table, *args, **kwargs) ","text":"Source code in tablite/export_utils.py def to_json(table, *args, **kwargs):\n    import json\n\n    sub_cls_check(table, BaseTable)\n    return json.dumps(table.as_json_serializable())\n "},{"location":"reference/export_utils/#tablite.export_utils.path_suffix_check","title":"tablite.export_utils.path_suffix_check(path, kind) ","text":"Source code in tablite/export_utils.py def path_suffix_check(path, kind):\n    if not path.suffix == kind:\n        raise ValueError(f\"Suffix mismatch: Expected {kind}, got {path.suffix} in {path.name}\")\n    if not path.parent.exists():\n        raise FileNotFoundError(f\"directory {path.parent} not found.\")\n "},{"location":"reference/export_utils/#tablite.export_utils.text_writer","title":"tablite.export_utils.text_writer(table, path, tqdm=_tqdm) ","text":"exports table to csv, tsv or txt depending on path suffix. follows the JSON norm. text escape is ON for all strings. "},{"location":"reference/export_utils/#tablite.export_utils.text_writer--note","title":"Note:","text":"If the delimiter is present in a string when the string is exported, text-escape is required, as the format otherwise is corrupted. When the file is being written, it is unknown whether any string in a column contains the delimiter. As text escaping the few strings that may contain the delimiter would lead to an asymmetric format, the safer guess is to text escape all strings. Source code in tablite/export_utils.py def text_writer(table, path, tqdm=_tqdm):\n    \"\"\"exports table to csv, tsv or txt depending on path suffix.\n    follows the JSON norm. text escape is ON for all strings.\n\n    Note:\n    ----------------------\n    If the delimiter is present in a string when the string is exported,\n    text-escape is required, as the format otherwise is corrupted.\n    When the file is being written, it is unknown whether any string in\n    a column contains the delimiter. 
As text escaping the few strings\n    that may contain the delimiter would lead to an asymmetric format,\n    the safer guess is to text escape all strings.\n    \"\"\"\n    sub_cls_check(table, BaseTable)\n    type_check(path, Path)\n\n    def txt(value):  # helper for text writer\n        if value is None:\n            return \"\"  # A column with 1,None,2 must be \"1,,2\".\n        elif isinstance(value, str):\n            # if not (value.startswith('\"') and value.endswith('\"')):\n            #     return f'\"{value}\"'  # this must be escape: \"the quick fox, jumped over the comma\"\n            # else:\n            return value  # this would for example be an empty string: \"\"\n        else:\n            return str(DataTypes.to_json(value))  # this handles datetimes, timedelta, etc.\n\n    delimiters = {\".csv\": \",\", \".tsv\": \"\\t\", \".txt\": \"|\"}\n    delimiter = delimiters.get(path.suffix)\n\n    with path.open(\"w\", encoding=\"utf-8\") as fo:\n        fo.write(delimiter.join(c for c in table.columns) + \"\\n\")\n        for row in tqdm(table.rows, total=len(table), disable=Config.TQDM_DISABLE):\n            fo.write(delimiter.join(txt(c) for c in row) + \"\\n\")\n "},{"location":"reference/export_utils/#tablite.export_utils.sql_writer","title":"tablite.export_utils.sql_writer(table, path) ","text":"Source code in tablite/export_utils.py def sql_writer(table, path):\n    type_check(table, BaseTable)\n    type_check(path, Path)\n    with path.open(\"w\", encoding=\"utf-8\") as fo:\n        fo.write(to_sql(table, name=path.stem))  # to_sql requires a table name; the file stem is a reasonable default.\n "},{"location":"reference/export_utils/#tablite.export_utils.json_writer","title":"tablite.export_utils.json_writer(table, path) ","text":"Source code in tablite/export_utils.py def json_writer(table, path):\n    type_check(table, BaseTable)\n    type_check(path, Path)\n    with path.open(\"w\") as fo:\n        fo.write(to_json(table))\n "},{"location":"reference/export_utils/#tablite.export_utils.to_html","title":"tablite.export_utils.to_html(table, path) ","text":"Source code in tablite/export_utils.py def to_html(table, path):\n    type_check(table, BaseTable)\n    type_check(path, Path)\n    with path.open(\"w\", encoding=\"utf-8\") as fo:\n        fo.write(table._repr_html_(slice(0, len(table))))\n "},{"location":"reference/file_reader_utils/","title":"File reader utils","text":""},{"location":"reference/file_reader_utils/#tablite.file_reader_utils","title":"tablite.file_reader_utils ","text":""},{"location":"reference/file_reader_utils/#tablite.file_reader_utils-attributes","title":"Attributes","text":""},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.ENCODING_GUESS_BYTES","title":"tablite.file_reader_utils.ENCODING_GUESS_BYTES = 10000 module-attribute ","text":""},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.header_readers","title":"tablite.file_reader_utils.header_readers = {'fods': excel_reader_headers, 'json': excel_reader_headers, 'simple': excel_reader_headers, 'rst': excel_reader_headers, 'mediawiki': excel_reader_headers, 'xlsx': excel_reader_headers, 'xlsm': excel_reader_headers, 'csv': text_reader_headers, 'tsv': text_reader_headers, 'txt': text_reader_headers, 'ods': ods_reader_headers} module-attribute ","text":""},{"location":"reference/file_reader_utils/#tablite.file_reader_utils-classes","title":"Classes","text":""},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.TextEscape","title":"tablite.file_reader_utils.TextEscape(openings='({[', closures=']})', text_qualifier='\"', delimiter=',', strip_leading_and_tailing_whitespace=False) ","text":" Bases: object enables parsing of CSV while respecting brackets and text marks. 
Example: text_escape = TextEscape() # set up the instance. for line in somefile.readlines(): list_of_words = text_escape(line) # use the instance. ... As an example, the Danes and Germans use \" for inches and ' for feet, so we will see data that contains nail (75 x 4 mm, 3\" x 3/12\"), so for this case ( and ) are valid escapes, but \" and ' aren't. Source code in tablite/file_reader_utils.py def __init__(\n    self,\n    openings=\"({[\",\n    closures=\"]})\",\n    text_qualifier='\"',\n    delimiter=\",\",\n    strip_leading_and_tailing_whitespace=False,\n):\n    \"\"\"\n    As an example, the Danes and Germans use \" for inches and ' for feet,\n    so we will see data that contains nail (75 x 4 mm, 3\" x 3/12\"), so\n    for this case ( and ) are valid escapes, but \" and ' aren't.\n\n    \"\"\"\n    if openings is None:\n        openings = [None]\n    elif isinstance(openings, str):\n        self.openings = {c for c in openings}\n    else:\n        raise TypeError(f\"expected str, got {type(openings)}\")\n\n    if closures is None:\n        closures = [None]\n    elif isinstance(closures, str):\n        self.closures = {c for c in closures}\n    else:\n        raise TypeError(f\"expected str, got {type(closures)}\")\n\n    if not isinstance(delimiter, str):\n        raise TypeError(f\"expected str, got {type(delimiter)}\")\n    self.delimiter = delimiter\n    self._delimiter_length = len(delimiter)\n    self.strip_leading_and_tailing_whitespace = strip_leading_and_tailing_whitespace\n\n    if text_qualifier is None:\n        pass\n    elif text_qualifier in openings + closures:\n        raise ValueError(\"It's a bad idea to have the quote character appear in openings or closures.\")\n    else:\n        self.qoute = text_qualifier\n\n    if not text_qualifier:\n        if not self.strip_leading_and_tailing_whitespace:\n            self.c = self._call_1\n        else:\n            self.c = self._call_2\n    else:\n        self.c = self._call_3\n "},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.TextEscape-attributes","title":"Attributes","text":""},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.TextEscape.openings","title":"tablite.file_reader_utils.TextEscape.openings = {c for c in openings} instance-attribute ","text":""},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.TextEscape.closures","title":"tablite.file_reader_utils.TextEscape.closures = {c for c in closures} instance-attribute ","text":""},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.TextEscape.delimiter","title":"tablite.file_reader_utils.TextEscape.delimiter = delimiter instance-attribute ","text":""},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.TextEscape.strip_leading_and_tailing_whitespace","title":"tablite.file_reader_utils.TextEscape.strip_leading_and_tailing_whitespace = strip_leading_and_tailing_whitespace instance-attribute ","text":""},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.TextEscape.qoute","title":"tablite.file_reader_utils.TextEscape.qoute = text_qualifier instance-attribute ","text":""},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.TextEscape.c","title":"tablite.file_reader_utils.TextEscape.c = self._call_1 instance-attribute ","text":""},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.TextEscape-functions","title":"Functions","text":""},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.TextEscape.__call__","title":"tablite.file_reader_utils.TextEscape.__call__(s) ","text":"Source code in tablite/file_reader_utils.py def __call__(self, s):\n    return self.c(s)\n 
"},{"location":"reference/file_reader_utils/#tablite.file_reader_utils-functions","title":"Functions","text":""},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.split_by_sequence","title":"tablite.file_reader_utils.split_by_sequence(text, sequence) ","text":"helper to split text according to a split sequence. Source code in tablite/file_reader_utils.py def split_by_sequence(text, sequence):\n \"\"\"helper to split text according to a split sequence.\"\"\"\n chunks = tuple()\n for element in sequence:\n idx = text.find(element)\n if idx < 0:\n raise ValueError(f\"'{element}' not in row\")\n chunk, text = text[:idx], text[len(element) + idx :]\n chunks += (chunk,)\n chunks += (text,) # the remaining text.\n return chunks\n "},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.detect_seperator","title":"tablite.file_reader_utils.detect_seperator(text) ","text":":param path: pathlib.Path objects :param encoding: file encoding. :return: 1 character. Source code in tablite/file_reader_utils.py def detect_seperator(text):\n \"\"\"\n :param path: pathlib.Path objects\n :param encoding: file encoding.\n :return: 1 character.\n \"\"\"\n # After reviewing the logic in the CSV sniffer, I concluded that all it\n # really does is to look for a non-text character. As the separator is\n # determined by the first line, which almost always is a line of headers,\n # the text characters will be utf-8,16 or ascii letters plus white space.\n # This leaves the characters ,;:| and \\t as potential separators, with one\n # exception: files that use whitespace as separator. My logic is therefore\n # to (1) find the set of characters that intersect with ',;:|\\t' which in\n # practice is a single character, unless (2) it is empty whereby it must\n # be whitespace.\n if len(text) == 0:\n return None\n seps = {\",\", \"\\t\", \";\", \":\", \"|\"}.intersection(text)\n if not seps:\n if \" \" in text:\n return \" \"\n if \"\\n\" in text:\n return \"\\n\"\n else:\n raise ValueError(\"separator not detected\")\n if len(seps) == 1:\n return seps.pop()\n else:\n frq = [(text.count(i), i) for i in seps]\n frq.sort(reverse=True) # most frequent first.\n return frq[0][-1]\n "},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.text_reader_headers","title":"tablite.file_reader_utils.text_reader_headers(path, delimiter, header_row_index, text_qualifier, linecount) ","text":"Source code in tablite/file_reader_utils.py def text_reader_headers(path, delimiter, header_row_index, text_qualifier, linecount):\n d = {}\n delimiters = {\n \".csv\": \",\",\n \".tsv\": \"\\t\",\n \".txt\": None,\n }\n\n try:\n with path.open(\"rb\") as fi:\n rawdata = fi.read(ENCODING_GUESS_BYTES)\n encoding = chardet.detect(rawdata)[\"encoding\"]\n\n if delimiter is None:\n with path.open(\"r\", encoding=encoding, errors=\"ignore\") as fi:\n lines = []\n for n, line in enumerate(fi, -header_row_index):\n if n < 0:\n continue\n line = line.rstrip(\"\\n\")\n lines.append(line)\n if n >= linecount:\n break # break on first\n try:\n d[\"delimiter\"] = delimiter = detect_seperator(\"\\n\".join(lines))\n except ValueError as e:\n if e.args == (\"separator not detected\", ):\n d[\"delimiter\"] = delimiter = None # this will handle the case of 1 column, 1 row\n else:\n raise e\n\n if delimiter is None:\n d[\"delimiter\"] = delimiter = delimiters[path.suffix] # pickup the default one\n d[path.name] = [lines]\n d[\"is_empty\"] = True # mark as empty to return an empty table instead of throwing\n else:\n kwargs = 
{}\n\n if text_qualifier is not None:\n kwargs[\"text_qualifier\"] = text_qualifier\n kwargs[\"quoting\"] = \"QUOTE_MINIMAL\"\n else:\n kwargs[\"quoting\"] = \"QUOTE_NONE\"\n\n d[path.name] = _get_headers(\n str(path), py_to_nim_encoding(encoding), header_row_index=header_row_index,\n delimiter=delimiter,\n linecount=linecount,\n **kwargs\n )\n return d\n except Exception as e:\n raise ValueError(f\"can't read {path.suffix}\")\n "},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.excel_reader_headers","title":"tablite.file_reader_utils.excel_reader_headers(path, delimiter, header_row_index, text_qualifier, linecount) ","text":"Source code in tablite/file_reader_utils.py def excel_reader_headers(path, delimiter, header_row_index, text_qualifier, linecount):\n d = {}\n book = openpyxl.open(str(path), read_only=True)\n\n try:\n all_sheets = book.sheetnames\n\n for sheet_name, sheet in ((name, book[name]) for name in all_sheets):\n fixup_worksheet(sheet)\n if sheet.max_row is None:\n max_rows = 0\n else:\n max_rows = min(sheet.max_row, linecount + 1)\n container = [None] * max_rows\n padding_ends = 0\n max_column = sheet.max_column\n\n for i, row_data in enumerate(sheet.iter_rows(0, header_row_index + max_rows, values_only=True), start=-header_row_index):\n if i < 0:\n # NOTE: for some reason `iter_rows` specifying a start row starts reading cells as binary, instead skip the rows that are before our first read row\n continue\n\n # NOTE: text readers do not cast types and give back strings, neither should xlsx reader, can't find documentation if it's possible to ignore this via `iter_rows` instead of casting back to string\n container[i] = [DataTypes.to_json(v) for v in row_data]\n\n for j, cell in enumerate(reversed(row_data)):\n if cell is None:\n continue\n\n padding_ends = max(padding_ends, max_column - j)\n\n break\n\n d[sheet_name] = [None if c is None else c[0:padding_ends] for c in container]\n d[\"delimiter\"] = None\n finally:\n book.close()\n\n return d\n "},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.ods_reader_headers","title":"tablite.file_reader_utils.ods_reader_headers(path, delimiter, header_row_index, text_qualifier, linecount) ","text":"Source code in tablite/file_reader_utils.py def ods_reader_headers(path, delimiter, header_row_index, text_qualifier, linecount):\n d = {\n \"delimiter\": None\n }\n sheets = pyexcel.get_book_dict(file_name=str(path))\n\n for sheet_name, data in sheets.items():\n lines = [[DataTypes.to_json(v) for v in row] for row in data[header_row_index:header_row_index+linecount]]\n\n d[sheet_name] = lines\n\n return d\n "},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.get_headers","title":"tablite.file_reader_utils.get_headers(path, delimiter=None, header_row_index=0, text_qualifier=None, linecount=10) ","text":"file format definition csv comma separated values tsv tab separated values csvz a zip file that contains one or many csv files tsvz a zip file that contains one or many tsv files xls a spreadsheet file format created by MS-Excel 97-2003 xlsx MS-Excel Extensions to the Office Open XML SpreadsheetML File Format. 
xlsm an MS-Excel Macro-Enabled Workbook file ods open document spreadsheet fods flat open document spreadsheet json JavaScript Object Notation html html table of the data structure simple simple presentation rst reStructuredText presentation of the data mediawiki media wiki table Source code in tablite/file_reader_utils.py def get_headers(path, delimiter=None, header_row_index=0, text_qualifier=None, linecount=10):\n    \"\"\"\n    file format\tdefinition\n    csv\t comma separated values\n    tsv\t tab separated values\n    csvz\ta zip file that contains one or many csv files\n    tsvz\ta zip file that contains one or many tsv files\n    xls\t a spreadsheet file format created by MS-Excel 97-2003\n    xlsx\tMS-Excel Extensions to the Office Open XML SpreadsheetML File Format.\n    xlsm\tan MS-Excel Macro-Enabled Workbook file\n    ods\t open document spreadsheet\n    fods\tflat open document spreadsheet\n    json\tJavaScript Object Notation\n    html\thtml table of the data structure\n    simple\tsimple presentation\n    rst\t reStructuredText presentation of the data\n    mediawiki\tmedia wiki table\n    \"\"\"\n    if isinstance(path, str):\n        path = Path(path)\n    if not isinstance(path, Path):\n        raise TypeError(\"expected pathlib path.\")\n    if not path.exists():\n        raise FileNotFoundError(str(path))\n    if delimiter is not None:\n        if not isinstance(delimiter, str):\n            raise TypeError(f\"expected str or None, not {type(delimiter)}\")\n\n    kwargs = {\n        \"path\": path,\n        \"delimiter\": delimiter,\n        \"header_row_index\": header_row_index,\n        \"text_qualifier\": text_qualifier,\n        \"linecount\": linecount\n    }\n\n    reader = header_readers.get(path.suffix[1:], None)\n\n    if reader is None:\n        raise TypeError(f\"file format for headers not supported: {path.suffix}\")\n\n    result = reader(**kwargs)\n\n    return result\n "},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.get_encoding","title":"tablite.file_reader_utils.get_encoding(path, nbytes=ENCODING_GUESS_BYTES) ","text":"Source code in tablite/file_reader_utils.py def get_encoding(path, nbytes=ENCODING_GUESS_BYTES):\n    nbytes = min(nbytes, path.stat().st_size)\n    with path.open(\"rb\") as fi:\n        rawdata = fi.read(nbytes)\n        encoding = chardet.detect(rawdata)[\"encoding\"]\n    if encoding == \"ascii\":  # utf-8 is backwards compatible with ascii\n        return \"utf-8\"  # -- so should the first 10k chars not be enough,\n    return encoding  # -- the utf-8 encoding will still get it right.\n "},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.get_delimiter","title":"tablite.file_reader_utils.get_delimiter(path, encoding) ","text":"Source code in tablite/file_reader_utils.py def get_delimiter(path, encoding):\n    with path.open(\"r\", encoding=encoding, errors=\"ignore\") as fi:\n        lines = []\n        for n, line in enumerate(fi):\n            line = line.rstrip(\"\\n\")\n            lines.append(line)\n            if n > 10:\n                break  # 10 lines is enough to detect the delimiter\n    delimiter = detect_seperator(\"\\n\".join(lines))\n    if delimiter is None:\n        raise ValueError(\"Delimiter could not be determined\")\n    return delimiter\n "},{"location":"reference/groupby_utils/","title":"Groupby utils","text":""},{"location":"reference/groupby_utils/#tablite.groupby_utils","title":"tablite.groupby_utils ","text":""},{"location":"reference/groupby_utils/#tablite.groupby_utils-classes","title":"Classes","text":""},{"location":"reference/groupby_utils/#tablite.groupby_utils.GroupBy","title":"tablite.groupby_utils.GroupBy ","text":" Bases: object 
"},{"location":"reference/groupby_utils/#tablite.groupby_utils.GroupBy-attributes","title":"Attributes","text":""},{"location":"reference/groupby_utils/#tablite.groupby_utils.GroupBy.max","title":"tablite.groupby_utils.GroupBy.max = 'Max' class-attribute instance-attribute ","text":""},{"location":"reference/groupby_utils/#tablite.groupby_utils.GroupBy.min","title":"tablite.groupby_utils.GroupBy.min = 'Min' class-attribute instance-attribute ","text":""},{"location":"reference/groupby_utils/#tablite.groupby_utils.GroupBy.sum","title":"tablite.groupby_utils.GroupBy.sum = 'Sum' class-attribute instance-attribute ","text":""},{"location":"reference/groupby_utils/#tablite.groupby_utils.GroupBy.product","title":"tablite.groupby_utils.GroupBy.product = 'Product' class-attribute instance-attribute ","text":""},{"location":"reference/groupby_utils/#tablite.groupby_utils.GroupBy.first","title":"tablite.groupby_utils.GroupBy.first = 'First' class-attribute instance-attribute ","text":""},{"location":"reference/groupby_utils/#tablite.groupby_utils.GroupBy.last","title":"tablite.groupby_utils.GroupBy.last = 'Last' class-attribute instance-attribute ","text":""},{"location":"reference/groupby_utils/#tablite.groupby_utils.GroupBy.count","title":"tablite.groupby_utils.GroupBy.count = 'Count' class-attribute instance-attribute ","text":""},{"location":"reference/groupby_utils/#tablite.groupby_utils.GroupBy.count_unique","title":"tablite.groupby_utils.GroupBy.count_unique = 'CountUnique' class-attribute instance-attribute ","text":""},{"location":"reference/groupby_utils/#tablite.groupby_utils.GroupBy.avg","title":"tablite.groupby_utils.GroupBy.avg = 'Average' class-attribute instance-attribute ","text":""},{"location":"reference/groupby_utils/#tablite.groupby_utils.GroupBy.stdev","title":"tablite.groupby_utils.GroupBy.stdev = 'StandardDeviation' class-attribute instance-attribute ","text":""},{"location":"reference/groupby_utils/#tablite.groupby_utils.GroupBy.median","title":"tablite.groupby_utils.GroupBy.median = 'Median' class-attribute instance-attribute ","text":""},{"location":"reference/groupby_utils/#tablite.groupby_utils.GroupBy.mode","title":"tablite.groupby_utils.GroupBy.mode = 'Mode' class-attribute instance-attribute ","text":""},{"location":"reference/import_utils/","title":"Import utils","text":""},{"location":"reference/import_utils/#tablite.import_utils","title":"tablite.import_utils ","text":""},{"location":"reference/import_utils/#tablite.import_utils-attributes","title":"Attributes","text":""},{"location":"reference/import_utils/#tablite.import_utils.file_readers","title":"tablite.import_utils.file_readers = {'fods': excel_reader, 'json': excel_reader, 'html': from_html, 'hdf5': from_hdf5, 'simple': excel_reader, 'rst': excel_reader, 'mediawiki': excel_reader, 'xlsx': excel_reader, 'xls': excel_reader, 'xlsm': excel_reader, 'csv': text_reader, 'tsv': text_reader, 'txt': text_reader, 'ods': ods_reader} module-attribute ","text":""},{"location":"reference/import_utils/#tablite.import_utils.valid_readers","title":"tablite.import_utils.valid_readers = ','.join(list(file_readers.keys())) module-attribute ","text":""},{"location":"reference/import_utils/#tablite.import_utils-classes","title":"Classes","text":""},{"location":"reference/import_utils/#tablite.import_utils.TRconfig","title":"tablite.import_utils.TRconfig(source, destination, start, end, guess_datatypes, delimiter, text_qualifier, text_escape_openings, text_escape_closures, strip_leading_and_tailing_whitespace, encoding, 
newline_offsets, fields) ","text":" Bases: object Source code in tablite/import_utils.py def __init__(\n self,\n source,\n destination,\n start,\n end,\n guess_datatypes,\n delimiter,\n text_qualifier,\n text_escape_openings,\n text_escape_closures,\n strip_leading_and_tailing_whitespace,\n encoding,\n newline_offsets,\n fields\n) -> None:\n self.source = source\n self.destination = destination\n self.start = start\n self.end = end\n self.guess_datatypes = guess_datatypes\n self.delimiter = delimiter\n self.text_qualifier = text_qualifier\n self.text_escape_openings = text_escape_openings\n self.text_escape_closures = text_escape_closures\n self.strip_leading_and_tailing_whitespace = strip_leading_and_tailing_whitespace\n self.encoding = encoding\n self.newline_offsets = newline_offsets\n self.fields = fields\n type_check(start, int),\n type_check(end, int),\n type_check(delimiter, str),\n type_check(text_qualifier, (str, type(None))),\n type_check(text_escape_openings, str),\n type_check(text_escape_closures, str),\n type_check(encoding, str),\n type_check(strip_leading_and_tailing_whitespace, bool),\n type_check(newline_offsets, list)\n type_check(fields, dict)\n "},{"location":"reference/import_utils/#tablite.import_utils.TRconfig-attributes","title":"Attributes","text":""},{"location":"reference/import_utils/#tablite.import_utils.TRconfig.source","title":"tablite.import_utils.TRconfig.source = source instance-attribute ","text":""},{"location":"reference/import_utils/#tablite.import_utils.TRconfig.destination","title":"tablite.import_utils.TRconfig.destination = destination instance-attribute ","text":""},{"location":"reference/import_utils/#tablite.import_utils.TRconfig.start","title":"tablite.import_utils.TRconfig.start = start instance-attribute ","text":""},{"location":"reference/import_utils/#tablite.import_utils.TRconfig.end","title":"tablite.import_utils.TRconfig.end = end instance-attribute ","text":""},{"location":"reference/import_utils/#tablite.import_utils.TRconfig.guess_datatypes","title":"tablite.import_utils.TRconfig.guess_datatypes = guess_datatypes instance-attribute ","text":""},{"location":"reference/import_utils/#tablite.import_utils.TRconfig.delimiter","title":"tablite.import_utils.TRconfig.delimiter = delimiter instance-attribute ","text":""},{"location":"reference/import_utils/#tablite.import_utils.TRconfig.text_qualifier","title":"tablite.import_utils.TRconfig.text_qualifier = text_qualifier instance-attribute ","text":""},{"location":"reference/import_utils/#tablite.import_utils.TRconfig.text_escape_openings","title":"tablite.import_utils.TRconfig.text_escape_openings = text_escape_openings instance-attribute ","text":""},{"location":"reference/import_utils/#tablite.import_utils.TRconfig.text_escape_closures","title":"tablite.import_utils.TRconfig.text_escape_closures = text_escape_closures instance-attribute ","text":""},{"location":"reference/import_utils/#tablite.import_utils.TRconfig.strip_leading_and_tailing_whitespace","title":"tablite.import_utils.TRconfig.strip_leading_and_tailing_whitespace = strip_leading_and_tailing_whitespace instance-attribute ","text":""},{"location":"reference/import_utils/#tablite.import_utils.TRconfig.encoding","title":"tablite.import_utils.TRconfig.encoding = encoding instance-attribute ","text":""},{"location":"reference/import_utils/#tablite.import_utils.TRconfig.newline_offsets","title":"tablite.import_utils.TRconfig.newline_offsets = newline_offsets instance-attribute 
","text":""},{"location":"reference/import_utils/#tablite.import_utils.TRconfig.fields","title":"tablite.import_utils.TRconfig.fields = fields instance-attribute ","text":""},{"location":"reference/import_utils/#tablite.import_utils.TRconfig-functions","title":"Functions","text":""},{"location":"reference/import_utils/#tablite.import_utils.TRconfig.copy","title":"tablite.import_utils.TRconfig.copy() ","text":"Source code in tablite/import_utils.py def copy(self):\n return TRconfig(**self.dict())\n "},{"location":"reference/import_utils/#tablite.import_utils.TRconfig.dict","title":"tablite.import_utils.TRconfig.dict() ","text":"Source code in tablite/import_utils.py def dict(self):\n return {k: v for k, v in self.__dict__.items() if not (k.startswith(\"_\") or callable(v))}\n "},{"location":"reference/import_utils/#tablite.import_utils-functions","title":"Functions","text":""},{"location":"reference/import_utils/#tablite.import_utils.from_pandas","title":"tablite.import_utils.from_pandas(T, df) ","text":"Creates Table using pd.to_dict('list') similar to: import pandas as pd df = pd.DataFrame({'a':[1,2,3], 'b':[4,5,6]}) df a b 0 1 4 1 2 5 2 3 6 df.to_dict('list') t = Table.from_dict(df.to_dict('list)) t.show() +===+===+===+ | # | a | b | |row|int|int| +---+---+---+ | 0 | 1| 4| | 1 | 2| 5| | 2 | 3| 6| +===+===+===+ Source code in tablite/import_utils.py def from_pandas(T, df):\n \"\"\"\n Creates Table using pd.to_dict('list')\n\n similar to:\n >>> import pandas as pd\n >>> df = pd.DataFrame({'a':[1,2,3], 'b':[4,5,6]})\n >>> df\n a b\n 0 1 4\n 1 2 5\n 2 3 6\n >>> df.to_dict('list')\n {'a': [1, 2, 3], 'b': [4, 5, 6]}\n\n >>> t = Table.from_dict(df.to_dict('list))\n >>> t.show()\n +===+===+===+\n | # | a | b |\n |row|int|int|\n +---+---+---+\n | 0 | 1| 4|\n | 1 | 2| 5|\n | 2 | 3| 6|\n +===+===+===+\n \"\"\"\n if not issubclass(T, BaseTable):\n raise TypeError(\"Expected subclass of Table\")\n\n return T(columns=df.to_dict(\"list\")) # noqa\n "},{"location":"reference/import_utils/#tablite.import_utils.from_hdf5","title":"tablite.import_utils.from_hdf5(T, path, tqdm=_tqdm, pbar=None) ","text":"imports an exported hdf5 table. 
Note that some loss of type information is to be expected in columns of mixed type: t.show(dtype=True) +===+===+=====+=====+====+=====+=====+===================+==========+========+===============+===+=========================+=====+===+ | # | A | B | C | D | E | F | G | H | I | J | K | L | M | O | |row|int|mixed|float|str |mixed| bool| datetime | date | time | timedelta |str| int |float|int| +---+---+-----+-----+----+-----+-----+-------------------+----------+--------+---------------+---+-------------------------+-----+---+ | 0 | -1|None | -1.1| |None |False|2023-06-09 09:12:06|2023-06-09|09:12:06| 1 day, 0:00:00|b |-100000000000000000000000| inf| 11| | 1 | 1| 1| 1.1|1000|1 | True|2023-06-09 09:12:06|2023-06-09|09:12:06|2 days, 0:06:40|\u55e8 | 100000000000000000000000| -inf|-11| +===+===+=====+=====+====+=====+=====+===================+==========+========+===============+===+=========================+=====+===+ t.to_hdf5(filename) t2 = Table.from_hdf5(filename) t2.show(dtype=True) +===+===+=====+=====+=====+=====+=====+===================+===================+========+===============+===+=========================+=====+===+ | # | A | B | C | D | E | F | G | H | I | J | K | L | M | O | |row|int|mixed|float|mixed|mixed| bool| datetime | datetime | time | str |str| int |float|int| +---+---+-----+-----+-----+-----+-----+-------------------+-------------------+--------+---------------+---+-------------------------+-----+---+ | 0 | -1|None | -1.1|None |None |False|2023-06-09 09:12:06|2023-06-09 00:00:00|09:12:06|1 day, 0:00:00 |b |-100000000000000000000000| inf| 11| | 1 | 1| 1| 1.1| 1000| 1| True|2023-06-09 09:12:06|2023-06-09 00:00:00|09:12:06|2 days, 0:06:40|\u55e8 | 100000000000000000000000| -inf|-11| +===+===+=====+=====+=====+=====+=====+===================+===================+========+===============+===+=========================+=====+===+ Source code in tablite/import_utils.py def from_hdf5(T, path, tqdm=_tqdm, pbar=None):\n \"\"\"\n imports an exported hdf5 table.\n\n Note that some loss of type information is to be expected in columns of mixed type:\n >>> t.show(dtype=True)\n +===+===+=====+=====+====+=====+=====+===================+==========+========+===============+===+=========================+=====+===+\n | # | A | B | C | D | E | F | G | H | I | J | K | L | M | O |\n |row|int|mixed|float|str |mixed| bool| datetime | date | time | timedelta |str| int |float|int|\n +---+---+-----+-----+----+-----+-----+-------------------+----------+--------+---------------+---+-------------------------+-----+---+\n | 0 | -1|None | -1.1| |None |False|2023-06-09 09:12:06|2023-06-09|09:12:06| 1 day, 0:00:00|b |-100000000000000000000000| inf| 11|\n | 1 | 1| 1| 1.1|1000|1 | True|2023-06-09 09:12:06|2023-06-09|09:12:06|2 days, 0:06:40|\u55e8 | 100000000000000000000000| -inf|-11|\n +===+===+=====+=====+====+=====+=====+===================+==========+========+===============+===+=========================+=====+===+\n >>> t.to_hdf5(filename)\n >>> t2 = Table.from_hdf5(filename)\n >>> t2.show(dtype=True)\n +===+===+=====+=====+=====+=====+=====+===================+===================+========+===============+===+=========================+=====+===+\n | # | A | B | C | D | E | F | G | H | I | J | K | L | M | O |\n |row|int|mixed|float|mixed|mixed| bool| datetime | datetime | time | str |str| int |float|int|\n +---+---+-----+-----+-----+-----+-----+-------------------+-------------------+--------+---------------+---+-------------------------+-----+---+\n | 0 | -1|None | -1.1|None |None |False|2023-06-09 
09:12:06|2023-06-09 00:00:00|09:12:06|1 day, 0:00:00 |b |-100000000000000000000000| inf| 11|\n | 1 | 1| 1| 1.1| 1000| 1| True|2023-06-09 09:12:06|2023-06-09 00:00:00|09:12:06|2 days, 0:06:40|\u55e8 | 100000000000000000000000| -inf|-11|\n +===+===+=====+=====+=====+=====+=====+===================+===================+========+===============+===+=========================+=====+===+\n \"\"\"\n if not issubclass(T, BaseTable):\n raise TypeError(\"Expected subclass of Table\")\n import h5py\n\n type_check(path, Path)\n t = T()\n with h5py.File(path, \"r\") as h5:\n for col_name in h5.keys():\n dset = h5[col_name]\n arr = np.array(dset[:])\n if arr.dtype == object:\n arr = np.array(DataTypes.guess([v.decode(\"utf-8\") for v in arr]))\n t[col_name] = arr\n return t\n "},{"location":"reference/import_utils/#tablite.import_utils.from_json","title":"tablite.import_utils.from_json(T, jsn) ","text":"Imports tables exported using .to_json Source code in tablite/import_utils.py def from_json(T, jsn):\n \"\"\"\n Imports tables exported using .to_json\n \"\"\"\n if not issubclass(T, BaseTable):\n raise TypeError(\"Expected subclass of Table\")\n import json\n\n type_check(jsn, str)\n d = json.loads(jsn)\n return T(columns=d[\"columns\"])\n "},{"location":"reference/import_utils/#tablite.import_utils.from_html","title":"tablite.import_utils.from_html(T, path, tqdm=_tqdm, pbar=None) ","text":"Source code in tablite/import_utils.py def from_html(T, path, tqdm=_tqdm, pbar=None):\n if not issubclass(T, BaseTable):\n raise TypeError(\"Expected subclass of Table\")\n type_check(path, Path)\n\n if pbar is None:\n total = path.stat().st_size\n pbar = tqdm(total=total, desc=\"from_html\", disable=Config.TQDM_DISABLE)\n\n row_start, row_end = \"<tr>\", \"</tr>\"\n value_start, value_end = \"<th>\", \"</th>\"\n chunk = \"\"\n t = None # will be T()\n start, end = 0, 0\n data = {}\n with path.open(\"r\") as fi:\n while True:\n start = chunk.find(row_start, start) # row tag start\n end = chunk.find(row_end, end) # row tag end\n if start == -1 or end == -1:\n new = fi.read(100_000)\n pbar.update(len(new))\n if new == \"\":\n break\n chunk += new\n continue\n # get indices from chunk\n row = chunk[start + len(row_start) : end]\n fields = [v.rstrip(value_end) for v in row.split(value_start)]\n if not data:\n headers = fields[:]\n data = {f: [] for f in headers}\n continue\n else:\n for field, header in zip(fields, headers):\n data[header].append(field)\n\n chunk = chunk[end + len(row_end) :]\n\n if len(data[headers[0]]) == Config.PAGE_SIZE:\n if t is None:\n t = T(columns=data)\n else:\n for k, v in data.items():\n t[k].extend(DataTypes.guess(v))\n data = {f: [] for f in headers}\n\n for k, v in data.items():\n t[k].extend(DataTypes.guess(v))\n return t\n "},{"location":"reference/import_utils/#tablite.import_utils.excel_reader","title":"tablite.import_utils.excel_reader(T, path, first_row_has_headers=True, header_row_index=0, sheet=None, columns=None, skip_empty='NONE', start=0, limit=sys.maxsize, tqdm=_tqdm, **kwargs) ","text":"returns Table from excel **kwargs are excess arguments that are ignored. 
Source code in tablite/import_utils.py def excel_reader(T, path, first_row_has_headers=True, header_row_index=0, sheet=None, columns=None, skip_empty=\"NONE\", start=0, limit=sys.maxsize, tqdm=_tqdm, **kwargs):\n \"\"\"\n returns Table from excel\n\n **kwargs are excess arguments that are ignored.\n \"\"\"\n if not issubclass(T, BaseTable):\n raise TypeError(\"Expected subclass of Table\")\n\n book = openpyxl.load_workbook(path, read_only=True, data_only=True)\n\n if sheet is None: # help the user.\n \"\"\"\n If no sheet specified, assume first sheet.\n\n Reasoning:\n Pandas ODS reader does that, so this preserves parity and it might be expected by users.\n If we don't know the sheet name but only have single sheet,\n we would need to take extra steps to find out the name of the sheet.\n We already make assumptions in case of column selection,\n when columns are None, we import all of them.\n \"\"\"\n sheet = book.sheetnames[0]\n elif sheet not in book.sheetnames:\n raise ValueError(f\"sheet not found: {sheet}\")\n\n if not (isinstance(start, int) and start >= 0):\n raise ValueError(\"expected start as an integer >=0\")\n if not (isinstance(limit, int) and limit > 0):\n raise ValueError(\"expected limit as integer > 0\")\n\n worksheet = book[sheet]\n fixup_worksheet(worksheet)\n\n try:\n it_header = worksheet.iter_rows(min_row=header_row_index + 1)\n while True:\n # get the first row to know our headers or the number of columns\n row = [c.value for c in next(it_header)]\n break\n fields = [str(c) if c is not None else \"\" for c in row] # excel is offset by 1\n except StopIteration:\n # excel was empty, return empty table\n return T()\n\n if not first_row_has_headers:\n # since the first row did not contain headers, we use the column count to populate header names\n fields = [str(i) for i in range(len(fields))]\n\n if columns is None:\n # no columns were specified by user to import, that means we import all of the them\n columns = []\n\n for f in fields:\n # fixup the duplicate column names\n columns.append(unique_name(f, columns))\n\n field_dict = {k: i for i, k in enumerate(columns)}\n else:\n field_dict = {}\n\n for k, i in ((k, fields.index(k)) for k in columns):\n # fixup the duplicate column names\n field_dict[unique_name(k, field_dict.keys())] = i\n\n # calculate our data rows iterator offset\n it_offset = start + (1 if first_row_has_headers else 0) + header_row_index + 1\n\n # attempt to fetch number of rows in the sheet\n total_rows = worksheet.max_row\n real_tqdm = True\n\n if total_rows is None:\n # i don't know what causes it but max_row can be None in some cases, so we don't know how large the dataset is\n total_rows = it_offset + limit\n real_tqdm = False\n\n # create the actual data rows iterator\n it_rows = worksheet.iter_rows(min_row=it_offset, max_row=min(it_offset+limit, total_rows))\n it_used_indices = list(field_dict.values())\n\n # filter columns that we're not going to use\n it_rows_filtered = ([row[idx].value for idx in it_used_indices] for row in it_rows)\n\n # create page directory\n workdir = Path(Config.workdir) / Config.pid\n pagesdir = workdir/\"pages\"\n pagesdir.mkdir(exist_ok=True, parents=True)\n\n field_names = list(field_dict.keys())\n column_count = len(field_names)\n\n page_fhs = None\n\n # prepopulate the table with columns\n table = T()\n for name in field_names:\n table[name] = Column(table.path)\n\n pbar_fname = path.name\n if len(pbar_fname) > 20:\n pbar_fname = pbar_fname[0:10] + \"...\" + pbar_fname[-7:]\n\n if real_tqdm:\n # we can create a 
true tqdm progress bar, make one\n tqdm_iter = tqdm(it_rows_filtered, total=total_rows, desc=f\"importing excel: {pbar_fname}\")\n else:\n \"\"\"\n openpyxls was unable to precalculate the size of the excel for whatever reason\n forcing recalc would require parsing entire file\n drop the progress bar in that case, just show iterations\n\n as an alternative we can use \u03a3=1/x but it just doesn't look good, show iterations per second instead\n \"\"\"\n tqdm_iter = tqdm(it_rows_filtered, desc=f\"importing excel: {pbar_fname}\")\n\n tqdm_iter = iter(tqdm_iter)\n\n idx = 0\n\n while True:\n try:\n row = next(tqdm_iter)\n except StopIteration:\n break # because in some cases we can't know the size of excel to set the upper iterator limit we loop until stop iteration is encountered\n\n if skip_empty == \"ALL\" and all(v is None for v in row):\n continue\n elif skip_empty == \"ANY\" and any(v is None for v in row):\n continue\n\n if idx % Config.PAGE_SIZE == 0:\n if page_fhs is not None:\n # we reached the max page file size, fix the pages\n [_fix_xls_page(table, c, fh) for c, fh in zip(field_names, page_fhs)]\n\n page_fhs = [None] * column_count\n\n for cidx in range(column_count):\n # allocate new pages\n pg_path = pagesdir / f\"{next(Page.ids)}.npy\"\n page_fhs[cidx] = open(pg_path, \"wb\")\n\n for fh, value in zip(page_fhs, row):\n \"\"\"\n since excel types are already cast into appropriate type we're going to do two passes per page\n\n we create our temporary custom format:\n packed type|packed byte count|packed bytes|...\n\n available types:\n * q - int64\n * d - float64\n * s - string\n * b - boolean\n * n - none\n * p - pickled (date, time, datetime)\n \"\"\"\n dtype = type(value)\n\n if dtype == int:\n ptype, bytes_ = b'q', struct.pack('q', value) # pack int as int64\n elif dtype == float:\n ptype, bytes_ = b'd', struct.pack('d', value) # pack float as float64\n elif dtype == str:\n ptype, bytes_ = b's', value.encode(\"utf-8\") # pack string\n elif dtype == bool:\n ptype, bytes_ = b'b', b'1' if value else b'0' # pack boolean\n elif value is None:\n ptype, bytes_ = b'n', b'' # pack none\n elif dtype in [date, time, datetime]:\n ptype, bytes_ = b'p', pkl.dumps(value) # pack object types via pickle\n else:\n raise NotImplementedError()\n\n byte_count = struct.pack('I', len(bytes_)) # pack our payload size, i doubt payload size can be over uint32\n\n # dump object to file\n fh.write(ptype)\n fh.write(byte_count)\n fh.write(bytes_)\n\n idx = idx + 1\n\n if page_fhs is not None:\n # we reached end of the loop, fix the pages\n [_fix_xls_page(table, c, fh) for c, fh in zip(field_names, page_fhs)]\n\n return table\n "},{"location":"reference/import_utils/#tablite.import_utils.ods_reader","title":"tablite.import_utils.ods_reader(T, path, first_row_has_headers=True, header_row_index=0, sheet=None, columns=None, skip_empty='NONE', start=0, limit=sys.maxsize, **kwargs) ","text":"returns Table from .ODS Source code in tablite/import_utils.py def ods_reader(T, path, first_row_has_headers=True, header_row_index=0, sheet=None, columns=None, skip_empty=\"NONE\", start=0, limit=sys.maxsize, **kwargs):\n \"\"\"\n returns Table from .ODS\n \"\"\"\n if not issubclass(T, BaseTable):\n raise TypeError(\"Expected subclass of Table\")\n\n if sheet is None:\n data = read_excel(str(path), header=None) # selects first sheet\n else:\n data = read_excel(str(path), sheet_name=sheet, header=None)\n\n data[isna(data)] = None # convert any empty cells to None\n data = data.to_numpy().tolist() # convert pandas to 
list\n\n if skip_empty == \"ALL\" or skip_empty == \"ANY\":\n \"\"\" filter out all rows based on predicate that come after header row \"\"\"\n fn_filter = any if skip_empty == \"ALL\" else all # this is intentional\n data = [\n row\n for ridx, row in enumerate(data)\n if ridx < header_row_index + (1 if first_row_has_headers else 0) or fn_filter(not (v is None or isinstance(v, str) and len(v) == 0) for v in row)\n ]\n\n data = np.array(data, dtype=np.object_) # cast back to numpy array for slicing but don't try to convert datatypes\n\n if not (isinstance(start, int) and start >= 0):\n raise ValueError(\"expected start as an integer >=0\")\n if not (isinstance(limit, int) and limit > 0):\n raise ValueError(\"expected limit as integer > 0\")\n\n t = T()\n\n used_columns_names = set()\n for ix, value in enumerate(data[header_row_index]):\n if first_row_has_headers:\n header, start_row_pos = \"\" if value is None else str(value), (1 + header_row_index)\n else:\n header, start_row_pos = f\"_{ix + 1}\", (0 + header_row_index)\n\n if columns is not None:\n if header not in columns:\n continue\n\n unique_column_name = unique_name(str(header), used_columns_names)\n used_columns_names.add(unique_column_name)\n\n column_values = data[start_row_pos : start_row_pos + limit, ix]\n\n t[unique_column_name] = column_values\n return t\n "},{"location":"reference/import_utils/#tablite.import_utils.text_reader_task","title":"tablite.import_utils.text_reader_task(source, destination, start, end, guess_datatypes, delimiter, text_qualifier, text_escape_openings, text_escape_closures, strip_leading_and_tailing_whitespace, encoding, newline_offsets, fields) ","text":"PARALLEL TASK FUNCTION reads columnsname + path[start:limit] into hdf5. source: csv or txt file destination: filename for page. start: int: start of page. end: int: end of page. 
guess_datatypes: bool: if True datatypes will be inferred by datatypes.Datatypes.guess delimiter: ',' ';' or '|' text_qualifier: str: commonly \" text_escape_openings: str: default: \"({[ text_escape_closures: str: default: ]})\" strip_leading_and_tailing_whitespace: bool encoding: chardet encoding ('utf-8, 'ascii', ..., 'ISO-22022-CN') Source code in tablite/import_utils.py def text_reader_task(\n source,\n destination,\n start,\n end,\n guess_datatypes,\n delimiter,\n text_qualifier,\n text_escape_openings,\n text_escape_closures,\n strip_leading_and_tailing_whitespace,\n encoding,\n newline_offsets,\n fields\n):\n \"\"\"PARALLEL TASK FUNCTION\n reads columnsname + path[start:limit] into hdf5.\n\n source: csv or txt file\n destination: filename for page.\n start: int: start of page.\n end: int: end of page.\n guess_datatypes: bool: if True datatypes will be inferred by datatypes.Datatypes.guess\n delimiter: ',' ';' or '|'\n text_qualifier: str: commonly \\\"\n text_escape_openings: str: default: \"({[\n text_escape_closures: str: default: ]})\"\n strip_leading_and_tailing_whitespace: bool\n encoding: chardet encoding ('utf-8, 'ascii', ..., 'ISO-22022-CN')\n \"\"\"\n if isinstance(source, str):\n source = Path(source)\n type_check(source, Path)\n if not source.exists():\n raise FileNotFoundError(f\"File not found: {source}\")\n type_check(destination, list)\n\n # declare CSV dialect.\n delim = delimiter\n\n class Dialect(csv.Dialect):\n delimiter = delim\n quotechar = '\"' if text_qualifier is None else text_qualifier\n escapechar = '\\\\'\n doublequote = True\n quoting = csv.QUOTE_MINIMAL\n skipinitialspace = False if strip_leading_and_tailing_whitespace is None else strip_leading_and_tailing_whitespace\n lineterminator = \"\\n\"\n\n with source.open(\"r\", encoding=encoding, errors=\"ignore\") as fi: # --READ\n fi.seek(newline_offsets[start])\n reader = csv.reader(fi, dialect=Dialect)\n\n # if there's an issue with file handlers on windows, we can make a special case for windows where the file is opened on demand and appended instead of opening all handlers at once\n page_file_handlers = [open(f, mode=\"wb\") for f in destination]\n\n # identify longest str\n longest_str = [1 for _ in range(len(destination))]\n for row in (next(reader) for _ in range(end - start)):\n for idx, c in ((fields[idx], c) for idx, c in filter(lambda t: t[0] in fields, enumerate(row))):\n longest_str[idx] = max(longest_str[idx], len(c))\n\n column_formats = [f\"<U{i}\" for i in longest_str]\n for idx, cf in enumerate(column_formats):\n _create_numpy_header(cf, (end - start, ), page_file_handlers[idx])\n\n # write page arrays to files\n fi.seek(newline_offsets[start])\n for row in (next(reader) for _ in range(end - start)):\n for idx, c in ((fields[idx], c) for idx, c in filter(lambda t: t[0] in fields, enumerate(row))):\n cbytes = np.asarray(c, dtype=column_formats[idx]).tobytes()\n page_file_handlers[idx].write(cbytes)\n\n [phf.close() for phf in page_file_handlers]\n "},{"location":"reference/import_utils/#tablite.import_utils.text_reader","title":"tablite.import_utils.text_reader(T, path, columns, first_row_has_headers, header_row_index, encoding, start, limit, newline, guess_datatypes, text_qualifier, strip_leading_and_tailing_whitespace, skip_empty, delimiter, text_escape_openings, text_escape_closures, tqdm=_tqdm, **kwargs) ","text":"Source code in tablite/import_utils.py def text_reader(\n T,\n path,\n columns,\n first_row_has_headers,\n header_row_index,\n encoding,\n start,\n limit,\n newline,\n 
guess_datatypes,\n text_qualifier,\n strip_leading_and_tailing_whitespace,\n skip_empty,\n delimiter,\n text_escape_openings,\n text_escape_closures,\n tqdm=_tqdm,\n **kwargs,\n):\n if encoding is None:\n encoding = get_encoding(path, nbytes=ENCODING_GUESS_BYTES)\n\n enc = py_to_nim_encoding(encoding)\n pid = Config.workdir / Config.pid\n kwargs = {}\n\n if first_row_has_headers is not None:\n kwargs[\"first_row_has_headers\"] = first_row_has_headers\n if header_row_index is not None:\n kwargs[\"header_row_index\"] = header_row_index\n if columns is not None:\n kwargs[\"columns\"] = columns\n if start is not None:\n kwargs[\"start\"] = start\n if limit is not None and limit != sys.maxsize:\n kwargs[\"limit\"] = limit\n if guess_datatypes is not None:\n kwargs[\"guess_datatypes\"] = guess_datatypes\n if newline is not None:\n kwargs[\"newline\"] = newline\n if delimiter is not None:\n kwargs[\"delimiter\"] = delimiter\n if text_qualifier is not None:\n kwargs[\"text_qualifier\"] = text_qualifier\n kwargs[\"quoting\"] = \"QUOTE_MINIMAL\"\n else:\n kwargs[\"quoting\"] = \"QUOTE_NONE\"\n if strip_leading_and_tailing_whitespace is not None:\n kwargs[\"strip_leading_and_tailing_whitespace\"] = strip_leading_and_tailing_whitespace\n\n if skip_empty is None:\n kwargs[\"skip_empty\"] = \"NONE\"\n else:\n kwargs[\"skip_empty\"] = skip_empty\n\n return nimlite.text_reader(\n T, pid, path, enc,\n **kwargs,\n tqdm=tqdm\n )\n "},{"location":"reference/import_utils/#tablite.import_utils-modules","title":"Modules","text":""},{"location":"reference/imputation/","title":"Imputation","text":""},{"location":"reference/imputation/#tablite.imputation","title":"tablite.imputation ","text":""},{"location":"reference/imputation/#tablite.imputation-classes","title":"Classes","text":""},{"location":"reference/imputation/#tablite.imputation-functions","title":"Functions","text":""},{"location":"reference/imputation/#tablite.imputation.imputation","title":"tablite.imputation.imputation(T, targets, missing=None, method='carry forward', sources=None, tqdm=_tqdm, pbar=None) ","text":"In statistics, imputation is the process of replacing missing data with substituted values. See more: https://en.wikipedia.org/wiki/Imputation_(statistics) PARAMETER DESCRIPTION table source table. TYPE: Table targets column names to find and replace missing values TYPE: str or list of strings missing values to be replaced. TYPE: None or iterable DEFAULT: None method method to be used for replacement. Options: 'carry forward': takes the previous value, and carries forward into fields where values are missing. +: quick. Realistic on time series. -: Can produce strange outliers. 'mean': calculates the column mean (exclude missing ) and copies the mean in as replacement. +: quick -: doesn't work on text. Causes data set to drift towards the mean. 'mode': calculates the column mode (exclude missing ) and copies the mean in as replacement. +: quick -: most frequent value becomes over-represented in the sample 'nearest neighbour': calculates normalised distance between items in source columns selects nearest neighbour and copies value as replacement. +: works for any datatype. -: computationally intensive (e.g. slow) TYPE: str DEFAULT: 'carry forward' sources NEAREST NEIGHBOUR ONLY column names to be used during imputation. if None or empty, all columns will be used. TYPE: list of strings DEFAULT: None RETURNS DESCRIPTION table table with replaced values. 
Source code in tablite/imputation.py def imputation(T, targets, missing=None, method=\"carry forward\", sources=None, tqdm=_tqdm, pbar=None):\n \"\"\"\n In statistics, imputation is the process of replacing missing data with substituted values.\n\n See more: https://en.wikipedia.org/wiki/Imputation_(statistics)\n\n Args:\n table (Table): source table.\n\n targets (str or list of strings): column names to find and\n replace missing values\n\n missing (None or iterable): values to be replaced.\n\n method (str): method to be used for replacement. Options:\n\n 'carry forward':\n takes the previous value, and carries forward into fields\n where values are missing.\n +: quick. Realistic on time series.\n -: Can produce strange outliers.\n\n 'mean':\n calculates the column mean (exclude `missing`) and copies\n the mean in as replacement.\n +: quick\n -: doesn't work on text. Causes data set to drift towards the mean.\n\n 'mode':\n calculates the column mode (exclude `missing`) and copies\n the mean in as replacement.\n +: quick\n -: most frequent value becomes over-represented in the sample\n\n 'nearest neighbour':\n calculates normalised distance between items in source columns\n selects nearest neighbour and copies value as replacement.\n +: works for any datatype.\n -: computationally intensive (e.g. slow)\n\n sources (list of strings): NEAREST NEIGHBOUR ONLY\n column names to be used during imputation.\n if None or empty, all columns will be used.\n\n Returns:\n table: table with replaced values.\n \"\"\"\n sub_cls_check(T, BaseTable)\n\n if isinstance(targets, str) and targets not in T.columns:\n targets = [targets]\n if isinstance(targets, list):\n for name in targets:\n if not isinstance(name, str):\n raise TypeError(f\"expected str, not {type(name)}\")\n if name not in T.columns:\n raise ValueError(f\"target item {name} not a column name in T.columns:\\n{T.columns}\")\n else:\n raise TypeError(\"Expected source as list of column names\")\n\n if missing is None:\n missing = {None}\n else:\n missing = set(missing)\n\n if method == \"nearest neighbour\":\n if sources in (None, []):\n sources = list(T.columns)\n if isinstance(sources, str):\n sources = [sources]\n if isinstance(sources, list):\n for name in sources:\n if not isinstance(name, str):\n raise TypeError(f\"expected str, not {type(name)}\")\n if name not in T.columns:\n raise ValueError(f\"source item {name} not a column name in T.columns:\\n{T.columns}\")\n else:\n raise TypeError(\"Expected source as list of column names\")\n\n methods = [\"nearest neighbour\", \"mean\", \"mode\", \"carry forward\"]\n\n if method == \"carry forward\":\n return carry_forward(T, targets, missing, tqdm=tqdm, pbar=pbar)\n elif method in {\"mean\", \"mode\"}:\n return stats_method(T, targets, missing, method, tqdm=tqdm, pbar=pbar)\n elif method == \"nearest neighbour\":\n return nearest_neighbour(T, sources, missing, targets, tqdm=tqdm)\n else:\n raise ValueError(f\"method {method} not recognised amonst known methods: {list(methods)})\")\n "},{"location":"reference/imputation/#tablite.imputation.carry_forward","title":"tablite.imputation.carry_forward(T, targets, missing, tqdm=_tqdm, pbar=None) ","text":"Source code in tablite/imputation.py def carry_forward(T, targets, missing, tqdm=_tqdm, pbar=None):\n assert isinstance(missing, set)\n\n if pbar is None:\n total = len(targets) * len(T)\n pbar = tqdm(total=total, desc=\"imputation.carry_forward\", disable=Config.TQDM_DISABLE)\n\n new = T.copy()\n for name in T.columns:\n if name in targets:\n data = 
T[name][:] # create copy\n last_value = None\n for ix, v in enumerate(data):\n if v in missing: # perform replacement\n data[ix] = last_value\n else: # keep last value.\n last_value = v\n pbar.update(1)\n new[name] = data\n else:\n new[name] = T[name]\n\n return new\n "},{"location":"reference/imputation/#tablite.imputation.stats_method","title":"tablite.imputation.stats_method(T, targets, missing, method, tqdm=_tqdm, pbar=None) ","text":"Source code in tablite/imputation.py def stats_method(T, targets, missing, method, tqdm=_tqdm, pbar=None):\n assert isinstance(missing, set)\n\n if pbar is None:\n total = len(targets)\n pbar = tqdm(total=total, desc=f\"imputation.{method}\", disable=Config.TQDM_DISABLE)\n\n new = T.copy()\n for name in T.columns:\n if name in targets:\n col = T.columns[name]\n assert isinstance(col, Column)\n\n hist_values, hist_counts = col.histogram()\n\n for m in missing:\n try:\n idx = hist_values.index(m)\n hist_counts[idx] = 0\n except ValueError:\n pass\n\n stats = summary_statistics(hist_values, hist_counts)\n\n new_value = stats[method]\n col.replace(mapping={m: new_value for m in missing})\n new[name] = col\n pbar.update(1)\n else:\n new[name] = T[name] # no entropy, keep as is.\n\n return new\n "},{"location":"reference/imputation/#tablite.imputation-modules","title":"Modules","text":""},{"location":"reference/joins/","title":"Joins","text":""},{"location":"reference/joins/#tablite.joins","title":"tablite.joins ","text":""},{"location":"reference/joins/#tablite.joins-classes","title":"Classes","text":""},{"location":"reference/joins/#tablite.joins-functions","title":"Functions","text":""},{"location":"reference/joins/#tablite.joins.join","title":"tablite.joins.join(T: BaseTable, other: BaseTable, left_keys: List[str], right_keys: List[str], left_columns: Union[List[str], None], right_columns: Union[List[str], None], kind: str = 'inner', merge_keys: bool = False, tqdm=_tqdm, pbar=None) ","text":"short-cut for all join functions. PARAMETER DESCRIPTION T left table TYPE: Table other right table TYPE: Table left_keys list of keys for the join from left table. TYPE: list right_keys list of keys for the join from right table. TYPE: list left_columns list of columns names to retain from left table. If None, all are retained. TYPE: list right_columns list of columns names to retain from right table. If None, all are retained. TYPE: list kind 'inner', 'left', 'outer', 'cross'. Defaults to \"inner\". TYPE: str DEFAULT: 'inner' tqdm tqdm progress counter. Defaults to _tqdm. TYPE: tqdm DEFAULT: tqdm pbar tqdm.progressbar. Defaults to None. TYPE: pbar DEFAULT: None RAISES DESCRIPTION ValueError if join type is unknown. RETURNS DESCRIPTION Table joined table. 
Example: \"inner\" SQL: SELECT number, letter FROM numbers JOIN letters ON numbers.colour == letters.color\n Tablite: >>> inner_join = numbers.inner_join(\n letters, \n left_keys=['colour'], \n right_keys=['color'], \n left_columns=['number'], \n right_columns=['letter']\n)\n Example: \"left\" SQL: SELECT number, letter FROM numbers LEFT JOIN letters ON numbers.colour == letters.color\n Tablite: >>> left_join = numbers.left_join(\n letters, \n left_keys=['colour'], \n right_keys=['color'], \n left_columns=['number'], \n right_columns=['letter']\n)\n Example: \"outer\" SQL: SELECT number, letter FROM numbers OUTER JOIN letters ON numbers.colour == letters.color\n Tablite: >>> outer_join = numbers.outer_join(\n letters, \n left_keys=['colour'], \n right_keys=['color'], \n left_columns=['number'], \n right_columns=['letter']\n )\n Example: \"cross\" CROSS JOIN returns the Cartesian product of rows from tables in the join. In other words, it will produce rows which combine each row from the first table with each row from the second table Source code in tablite/joins.py def join(\n T: BaseTable,\n other: BaseTable,\n left_keys: List[str],\n right_keys: List[str],\n left_columns: Union[List[str], None],\n right_columns: Union[List[str], None],\n kind: str = \"inner\",\n merge_keys: bool = False,\n tqdm=_tqdm,\n pbar=None,\n):\n \"\"\"short-cut for all join functions.\n\n Args:\n T (Table): left table\n other (Table): right table\n left_keys (list): list of keys for the join from left table.\n right_keys (list): list of keys for the join from right table.\n left_columns (list): list of columns names to retain from left table.\n If None, all are retained.\n right_columns (list): list of columns names to retain from right table.\n If None, all are retained.\n kind (str, optional): 'inner', 'left', 'outer', 'cross'. Defaults to \"inner\".\n tqdm (tqdm, optional): tqdm progress counter. Defaults to _tqdm.\n pbar (tqdm.pbar, optional): tqdm.progressbar. 
Defaults to None.\n\n Raises:\n ValueError: if join type is unknown.\n\n Returns:\n Table: joined table.\n\n Example: \"inner\"\n ```\n SQL: SELECT number, letter FROM numbers JOIN letters ON numbers.colour == letters.color\n ```\n Tablite: \n ```\n >>> inner_join = numbers.inner_join(\n letters, \n left_keys=['colour'], \n right_keys=['color'], \n left_columns=['number'], \n right_columns=['letter']\n )\n ```\n\n Example: \"left\" \n ```\n SQL: SELECT number, letter FROM numbers LEFT JOIN letters ON numbers.colour == letters.color\n ```\n Tablite: \n ```\n >>> left_join = numbers.left_join(\n letters, \n left_keys=['colour'], \n right_keys=['color'], \n left_columns=['number'], \n right_columns=['letter']\n )\n ```\n\n Example: \"outer\"\n ```\n SQL: SELECT number, letter FROM numbers OUTER JOIN letters ON numbers.colour == letters.color\n ```\n\n Tablite: \n ```\n >>> outer_join = numbers.outer_join(\n letters, \n left_keys=['colour'], \n right_keys=['color'], \n left_columns=['number'], \n right_columns=['letter']\n )\n ```\n\n Example: \"cross\"\n\n CROSS JOIN returns the Cartesian product of rows from tables in the join.\n In other words, it will produce rows which combine each row from the first table\n with each row from the second table\n \"\"\"\n if left_columns is None:\n left_columns = list(T.columns)\n if right_columns is None:\n right_columns = list(other.columns)\n assert merge_keys in {True,False}\n\n _jointype_check(T, other, left_keys, right_keys, left_columns, right_columns)\n\n return _join(kind, T,other,left_keys, right_keys, left_columns, right_columns, merge_keys=merge_keys,\n tqdm=tqdm, pbar=pbar)\n "},{"location":"reference/joins/#tablite.joins.inner_join","title":"tablite.joins.inner_join(T: BaseTable, other: BaseTable, left_keys: List[str], right_keys: List[str], left_columns: Union[List[str], None], right_columns: Union[List[str], None], merge_keys: bool = False, tqdm=_tqdm, pbar=None) ","text":"Source code in tablite/joins.py def inner_join(T: BaseTable, other: BaseTable, left_keys: List[str], right_keys: List[str], \n left_columns: Union[List[str], None], right_columns: Union[List[str], None],\n merge_keys: bool = False, tqdm=_tqdm, pbar=None):\n return join(T, other, left_keys, right_keys, left_columns, right_columns, kind=\"inner\", merge_keys=merge_keys, tqdm=tqdm,pbar=pbar)\n "},{"location":"reference/joins/#tablite.joins.left_join","title":"tablite.joins.left_join(T: BaseTable, other: BaseTable, left_keys: List[str], right_keys: List[str], left_columns: Union[List[str], None], right_columns: Union[List[str], None], merge_keys: bool = False, tqdm=_tqdm, pbar=None) ","text":"Source code in tablite/joins.py def left_join(T: BaseTable, other: BaseTable, left_keys: List[str], right_keys: List[str], \n left_columns: Union[List[str], None], right_columns: Union[List[str], None],\n merge_keys: bool = False, tqdm=_tqdm, pbar=None):\n return join(T, other, left_keys, right_keys, left_columns, right_columns, kind=\"left\", merge_keys=merge_keys, tqdm=tqdm,pbar=pbar)\n "},{"location":"reference/joins/#tablite.joins.outer_join","title":"tablite.joins.outer_join(T: BaseTable, other: BaseTable, left_keys: List[str], right_keys: List[str], left_columns: Union[List[str], None], right_columns: Union[List[str], None], merge_keys: bool = False, tqdm=_tqdm, pbar=None) ","text":"Source code in tablite/joins.py def outer_join(T: BaseTable, other: BaseTable, left_keys: List[str], right_keys: List[str], \n left_columns: Union[List[str], None], right_columns: Union[List[str], 
None],\n merge_keys: bool = False, tqdm=_tqdm, pbar=None):\n return join(T, other, left_keys, right_keys, left_columns, right_columns, kind=\"outer\", merge_keys=merge_keys, tqdm=tqdm,pbar=pbar)\n "},{"location":"reference/joins/#tablite.joins.cross_join","title":"tablite.joins.cross_join(T: BaseTable, other: BaseTable, left_keys: List[str], right_keys: List[str], left_columns: Union[List[str], None], right_columns: Union[List[str], None], merge_keys: bool = False, tqdm=_tqdm, pbar=None) ","text":"Source code in tablite/joins.py def cross_join(T: BaseTable, other: BaseTable, left_keys: List[str], right_keys: List[str], \n left_columns: Union[List[str], None], right_columns: Union[List[str], None],\n merge_keys: bool = False, tqdm=_tqdm, pbar=None):\n return join(T, other, left_keys, right_keys, left_columns, right_columns, kind=\"cross\", merge_keys=merge_keys, tqdm=tqdm,pbar=pbar)\n "},{"location":"reference/lookup/","title":"Lookup","text":""},{"location":"reference/lookup/#tablite.lookup","title":"tablite.lookup ","text":""},{"location":"reference/lookup/#tablite.lookup-attributes","title":"Attributes","text":""},{"location":"reference/lookup/#tablite.lookup-classes","title":"Classes","text":""},{"location":"reference/lookup/#tablite.lookup-functions","title":"Functions","text":""},{"location":"reference/lookup/#tablite.lookup.lookup","title":"tablite.lookup.lookup(T, other, *criteria, all=True, tqdm=_tqdm) ","text":"function for looking up values in other according to criteria in ascending order. :param: T: Table :param: other: Table sorted in ascending search order. :param: criteria: Each criteria must be a tuple with value comparisons in the form: (LEFT, OPERATOR, RIGHT) :param: all: boolean: True=ALL, False=ANY OPERATOR must be a callable that returns a boolean LEFT must be a value that the OPERATOR can compare. RIGHT must be a value that the OPERATOR can compare. Examples: comparison of two columns: ('column A', \"==\", 'column B')\n compare value from column 'Date' with date 24/12. 
('Date', \"<\", DataTypes.date(24,12) )\n uses custom function to compare value from column 'text 1' with value from column 'text 2' f = lambda L,R: all( ord(L) < ord(R) )\n('text 1', f, 'text 2')\n Source code in tablite/lookup.py def lookup(T, other, *criteria, all=True, tqdm=_tqdm):\n \"\"\"function for looking up values in `other` according to criteria in ascending order.\n :param: T: Table \n :param: other: Table sorted in ascending search order.\n :param: criteria: Each criteria must be a tuple with value comparisons in the form:\n (LEFT, OPERATOR, RIGHT)\n :param: all: boolean: True=ALL, False=ANY\n\n OPERATOR must be a callable that returns a boolean\n LEFT must be a value that the OPERATOR can compare.\n RIGHT must be a value that the OPERATOR can compare.\n\n Examples:\n comparison of two columns:\n\n ('column A', \"==\", 'column B')\n\n compare value from column 'Date' with date 24/12.\n\n ('Date', \"<\", DataTypes.date(24,12) )\n\n uses custom function to compare value from column\n 'text 1' with value from column 'text 2'\n\n f = lambda L,R: all( ord(L) < ord(R) )\n ('text 1', f, 'text 2')\n\n \"\"\"\n sub_cls_check(T, BaseTable)\n sub_cls_check(other, BaseTable)\n\n all = all\n any = not all\n\n ops = lookup_ops\n\n functions, left_criteria, right_criteria = [], set(), set()\n\n for left, op, right in criteria:\n left_criteria.add(left)\n right_criteria.add(right)\n if callable(op):\n pass # it's a custom function.\n else:\n op = ops.get(op, None)\n if not callable(op):\n raise ValueError(f\"{op} not a recognised operator for comparison.\")\n\n functions.append((op, left, right))\n left_columns = [n for n in left_criteria if n in T.columns]\n right_columns = [n for n in right_criteria if n in other.columns]\n\n result_index = np.empty(shape=(len(T)), dtype=np.int64)\n cache = {}\n left = T[left_columns]\n Constr = type(T)\n if isinstance(left, Column):\n tmp, left = left, Constr()\n left[left_columns[0]] = tmp\n right = other[right_columns]\n if isinstance(right, Column):\n tmp, right = right, Constr()\n right[right_columns[0]] = tmp\n assert isinstance(left, BaseTable)\n assert isinstance(right, BaseTable)\n\n for ix, row1 in tqdm(enumerate(left.rows), total=len(T), disable=Config.TQDM_DISABLE):\n row1_tup = tuple(row1)\n row1d = {name: value for name, value in zip(left_columns, row1)}\n row1_hash = hash(row1_tup)\n\n match_found = True if row1_hash in cache else False\n\n if not match_found: # search.\n for row2ix, row2 in enumerate(right.rows):\n row2d = {name: value for name, value in zip(right_columns, row2)}\n\n evaluations = {op(row1d.get(left, left), row2d.get(right, right)) for op, left, right in functions}\n # The evaluations above does a neat trick:\n # as L is a dict, L.get(left, L) will return a value\n # from the columns IF left is a column name. 
If it isn't\n # the function will treat left as a value.\n # The same applies to right.\n all_ = all and (False not in evaluations)\n any_ = any and True in evaluations\n if all_ or any_:\n match_found = True\n cache[row1_hash] = row2ix\n break\n\n if not match_found: # no match found.\n cache[row1_hash] = -1 # -1 is replacement for None in the index as numpy can't handle Nones.\n\n result_index[ix] = cache[row1_hash]\n\n f = select_processing_method(2 * max(len(T), len(other)), _sp_lookup, _mp_lookup)\n return f(T, other, result_index)\n "},{"location":"reference/match/","title":"Match","text":""},{"location":"reference/match/#tablite.match","title":"tablite.match ","text":""},{"location":"reference/match/#tablite.match-classes","title":"Classes","text":""},{"location":"reference/match/#tablite.match-functions","title":"Functions","text":""},{"location":"reference/match/#tablite.match.match","title":"tablite.match.match(T, other, *criteria, keep_left=None, keep_right=None) ","text":"performs inner join where T matches other and removes rows that do not match. :param: T: Table :param: other: Table :param: criteria: Each criteria must be a tuple with value comparisons in the form: (LEFT, OPERATOR, RIGHT), where operator must be \"==\"\n\nExample:\n ('column A', \"==\", 'column B')\n\nThis syntax follows the lookup syntax. See Lookup for details.\n :param: keep_left: list of columns to keep. :param: keep_right: list of right columns to keep. Source code in tablite/match.py def match(T, other, *criteria, keep_left=None, keep_right=None): # lookup and filter combined - drops unmatched rows.\n \"\"\"\n performs inner join where `T` matches `other` and removes rows that do not match.\n\n :param: T: Table\n :param: other: Table\n :param: criteria: Each criteria must be a tuple with value comparisons in the form:\n\n (LEFT, OPERATOR, RIGHT), where operator must be \"==\"\n\n Example:\n ('column A', \"==\", 'column B')\n\n This syntax follows the lookup syntax. See Lookup for details.\n\n :param: keep_left: list of columns to keep.\n :param: keep_right: list of right columns to keep.\n \"\"\"\n assert isinstance(T, BaseTable)\n assert isinstance(other, BaseTable)\n if keep_left is None:\n keep_left = [n for n in T.columns]\n else:\n type_check(keep_left, list)\n name_check(T.columns, *keep_left)\n\n if keep_right is None:\n keep_right = [n for n in other.columns]\n else:\n type_check(keep_right, list)\n name_check(other.columns, *keep_right)\n\n indices = np.full(shape=(len(T),), fill_value=-1, dtype=np.int64)\n for arg in criteria:\n b,_,a = arg\n if _ != \"==\":\n raise ValueError(\"match requires A == B. 
For other logic visit `lookup`\")\n if b not in T.columns:\n raise ValueError(f\"Column {b} not found in T for criteria: {arg}\")\n if a not in other.columns:\n raise ValueError(f\"Column {a} not found in T for criteria: {arg}\")\n\n index_update = find_indices(other[a][:], T[b][:], fill_value=-1)\n indices = merge_indices(indices, index_update)\n\n cls = type(T)\n new = cls()\n for name in T.columns:\n if name in keep_left:\n new[name] = np.compress(indices != -1, T[name][:])\n\n for name in other.columns:\n if name in keep_right:\n new_name = unique_name(name, new.columns)\n primary = np.compress(indices != -1, indices)\n new[new_name] = np.take(other[name][:], primary)\n\n return new\n "},{"location":"reference/match/#tablite.match.find_indices","title":"tablite.match.find_indices(x, y, fill_value=-1) ","text":"finds index of y in x Source code in tablite/match.py def find_indices(x,y, fill_value=-1): # fast.\n \"\"\"\n finds index of y in x\n \"\"\"\n # disassembly of numpy:\n # import numpy as np\n # x = np.array([3, 5, 7, 1, 9, 8, 6, 6])\n # y = np.array([2, 1, 5, 10, 100, 6])\n index = np.argsort(x) # array([3, 0, 1, 6, 7, 2, 5, 4])\n sorted_x = x[index] # array([1, 3, 5, 6, 6, 7, 8, 9])\n sorted_index = np.searchsorted(sorted_x, y) # array([1, 0, 2, 8, 8, 3])\n yindex = np.take(index, sorted_index, mode=\"clip\") # array([0, 3, 1, 4, 4, 6])\n mask = x[yindex] != y # array([ True, False, False, True, True, False])\n indices = np.ma.array(yindex, mask=mask, fill_value=fill_value) \n # masked_array(data=[--, 3, 1, --, --, 6], mask=[ True, False, False, True, True, False], fill_value=999999)\n # --: y[0] not in x\n # 3 : y[1] == x[3]\n # 1 : y[2] == x[1]\n # --: y[3] not in x\n # --: y[4] not in x\n # --: y[5] == x[6]\n result = np.where(~indices.mask, indices.data, -1) \n return result # array([-1, 3, 1, -1, -1, 6])\n "},{"location":"reference/match/#tablite.match.merge_indices","title":"tablite.match.merge_indices(x1, *args, fill_value=-1) ","text":"merges x1 and x2 where Source code in tablite/match.py def merge_indices(x1, *args, fill_value=-1):\n \"\"\"\n merges x1 and x2 where \n \"\"\"\n # dis:\n # >>> AA = array([-1, 3, -1, 5])\n # >>> BB = array([-1, -1, 4, 5])\n new = x1[:] # = AA\n for arg in args:\n mask = (new == fill_value) # array([True, False, True, False])\n new = np.where(mask, arg, new) # array([-1, 3, 4, 5])\n return new # array([-1, 3, 4, 5])\n "},{"location":"reference/merge/","title":"Merge","text":""},{"location":"reference/merge/#tablite.merge","title":"tablite.merge ","text":""},{"location":"reference/merge/#tablite.merge-classes","title":"Classes","text":""},{"location":"reference/merge/#tablite.merge-functions","title":"Functions","text":""},{"location":"reference/merge/#tablite.merge.where","title":"tablite.merge.where(T, criteria, left, right, new) ","text":"takes from LEFT where criteria is True else RIGHT and creates a single new column. 
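A self-contained sketch of `where` as summarised above (its parameters are detailed just below); the table contents are invented:

```python
# Hypothetical sketch of tablite.merge.where, per the summary above:
# take from `left` where criteria is True, else from `right`, into `new`.
import numpy as np

from tablite import Table  # assumed package-root import
from tablite.merge import where

t = Table(columns={"a": [1, 2, 3], "b": [10, 20, 30]})
mask = np.array([True, False, True])  # one bool per row

# Consumes columns 'a' and 'b' and adds 'c' == [1, 20, 3].
t = where(t, criteria=mask, left="a", right="b", new="c")
t.show()
```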
:param: T: Table :param: criteria: np.array(bool): if True take left column else take right column :param left: (str) column name :param right: (str) column name :param new: (str) new name :returns: T Source code in tablite/merge.py def where(T, criteria, left, right, new):\n \"\"\" takes from LEFT where criteria is True else RIGHT \n and creates a single new column.\n\n :param: T: Table\n :param: criteria: np.array(bool): \n if True take left column\n else take right column\n :param left: (str) column name\n :param right: (str) column name\n :param new: (str) new name\n\n :returns: T\n \"\"\"\n type_check(T, BaseTable)\n if isinstance(criteria, np.ndarray):\n if not criteria.dtype == \"bool\":\n raise TypeError\n else:\n criteria = np.array(criteria, dtype='bool')\n\n new_uq = unique_name(new, list(T.columns))\n T.add_column(new_uq)\n col = T[new_uq]\n\n for start,end in Config.page_steps(len(criteria)):\n left_values = T[left][start:end]\n right_values = T[right][start:end]\n new_values = np.where(criteria, left_values, right_values)\n col.extend(new_values)\n\n if new == right:\n T[right] = T[new_uq] # keep column order\n del T[new_uq]\n del T[left]\n elif new == left:\n T[left] = T[new_uq] # keep column order\n del T[new_uq]\n del T[right]\n else:\n T[new] = T[new_uq]\n del T[left]\n del T[right]\n return T\n "},{"location":"reference/mp_utils/","title":"Mp utils","text":""},{"location":"reference/mp_utils/#tablite.mp_utils","title":"tablite.mp_utils ","text":""},{"location":"reference/mp_utils/#tablite.mp_utils-attributes","title":"Attributes","text":""},{"location":"reference/mp_utils/#tablite.mp_utils.lookup_ops","title":"tablite.mp_utils.lookup_ops = {'in': _in, 'not in': not_in, '<': operator.lt, '<=': operator.le, '>': operator.gt, '>=': operator.ge, '!=': operator.ne, '==': operator.eq} module-attribute ","text":""},{"location":"reference/mp_utils/#tablite.mp_utils.filter_ops","title":"tablite.mp_utils.filter_ops = {'>': operator.gt, '>=': operator.ge, '==': operator.eq, '<': operator.lt, '<=': operator.le, '!=': operator.ne, 'in': _in} module-attribute ","text":""},{"location":"reference/mp_utils/#tablite.mp_utils.filter_ops_from_text","title":"tablite.mp_utils.filter_ops_from_text = {'gt': '>', 'gteq': '>=', 'eq': '==', 'lt': '<', 'lteq': '<=', 'neq': '!=', 'in': _in} module-attribute ","text":""},{"location":"reference/mp_utils/#tablite.mp_utils-classes","title":"Classes","text":""},{"location":"reference/mp_utils/#tablite.mp_utils-functions","title":"Functions","text":""},{"location":"reference/mp_utils/#tablite.mp_utils.not_in","title":"tablite.mp_utils.not_in(a, b) ","text":"Source code in tablite/mp_utils.py def not_in(a, b):\n return not operator.contains(str(a), str(b))\n "},{"location":"reference/mp_utils/#tablite.mp_utils.is_mp","title":"tablite.mp_utils.is_mp(fields: int) -> bool ","text":"PARAMETER DESCRIPTION fields number of fields TYPE: int RETURNS DESCRIPTION bool bool Source code in tablite/mp_utils.py def is_mp(fields: int) -> bool:\n \"\"\"\n\n Args:\n fields (int): number of fields\n\n Returns:\n bool\n \"\"\"\n if Config.MULTIPROCESSING_MODE == Config.FORCE:\n return True\n\n if Config.MULTIPROCESSING_MODE == Config.FALSE:\n return False\n\n if fields < Config.SINGLE_PROCESSING_LIMIT:\n return False\n\n if max(psutil.cpu_count(logical=False), 1) < 2:\n return False\n\n return True\n "},{"location":"reference/mp_utils/#tablite.mp_utils.select_processing_method","title":"tablite.mp_utils.select_processing_method(fields, sp, mp) ","text":"PARAMETER DESCRIPTION 
fields number of fields TYPE: int sp method for single processing TYPE: callable mp method for multiprocessing TYPE: callable RETURNS DESCRIPTION _type_ description Source code in tablite/mp_utils.py def select_processing_method(fields, sp, mp):\n \"\"\"\n\n Args:\n fields (int): number of fields\n sp (callable): method for single processing\n mp (callable): method for multiprocessing\n\n Returns:\n _type_: _description_\n \"\"\"\n return mp if is_mp(fields) else sp\n "},{"location":"reference/mp_utils/#tablite.mp_utils.maskify","title":"tablite.mp_utils.maskify(arr) ","text":"Source code in tablite/mp_utils.py def maskify(arr):\n none_mask = [False] * len(arr) # Setting the default\n\n for i in range(len(arr)):\n if arr[i] is None: # Check if our value is None\n none_mask[i] = True\n arr[i] = 0 # Remove None from the original array\n\n return none_mask\n "},{"location":"reference/mp_utils/#tablite.mp_utils.share_mem","title":"tablite.mp_utils.share_mem(inp_arr, dtype) ","text":"Source code in tablite/mp_utils.py def share_mem(inp_arr, dtype):\n len_ = len(inp_arr)\n size = np.dtype(dtype).itemsize * len_\n shape = (len_,)\n\n out_shm = shared_memory.SharedMemory(create=True, size=size) # the co_processors will read this.\n out_arr_index = np.ndarray(shape, dtype=dtype, buffer=out_shm.buf)\n out_arr_index[:] = inp_arr\n\n return out_arr_index, out_shm\n "},{"location":"reference/mp_utils/#tablite.mp_utils.map_task","title":"tablite.mp_utils.map_task(data_shm_name, index_shm_name, destination_shm_name, shape, dtype, start, end) ","text":"Source code in tablite/mp_utils.py def map_task(data_shm_name, index_shm_name, destination_shm_name, shape, dtype, start, end):\n # connect\n shared_data = shared_memory.SharedMemory(name=data_shm_name)\n data = np.ndarray(shape, dtype=dtype, buffer=shared_data.buf)\n\n shared_index = shared_memory.SharedMemory(name=index_shm_name)\n index = np.ndarray(shape, dtype=np.int64, buffer=shared_index.buf)\n\n shared_target = shared_memory.SharedMemory(name=destination_shm_name)\n target = np.ndarray(shape, dtype=dtype, buffer=shared_target.buf)\n # work\n target[start:end] = np.take(data[start:end], index[start:end])\n # disconnect\n shared_data.close()\n shared_index.close()\n shared_target.close()\n "},{"location":"reference/mp_utils/#tablite.mp_utils.reindex_task","title":"tablite.mp_utils.reindex_task(src, dst, index_shm, shm_shape, start, end) ","text":"Source code in tablite/mp_utils.py def reindex_task(src, dst, index_shm, shm_shape, start, end):\n # connect\n existing_shm = shared_memory.SharedMemory(name=index_shm)\n shared_index = np.ndarray(shm_shape, dtype=np.int64, buffer=existing_shm.buf)\n # work\n array = load_numpy(src)\n new = np.take(array, shared_index[start:end])\n np.save(dst, new, allow_pickle=True, fix_imports=False)\n # disconnect\n existing_shm.close()\n "},{"location":"reference/nimlite/","title":"Nimlite","text":""},{"location":"reference/nimlite/#tablite.nimlite","title":"tablite.nimlite ","text":""},{"location":"reference/nimlite/#tablite.nimlite-attributes","title":"Attributes","text":""},{"location":"reference/nimlite/#tablite.nimlite.paths","title":"tablite.nimlite.paths = sys.argv[:] module-attribute ","text":""},{"location":"reference/nimlite/#tablite.nimlite.K","title":"tablite.nimlite.K = TypeVar('K', bound=BaseTable) module-attribute ","text":""},{"location":"reference/nimlite/#tablite.nimlite.ValidEncoders","title":"tablite.nimlite.ValidEncoders = Literal['ENC_UTF8', 'ENC_UTF16', 'ENC_WIN1250'] module-attribute 
","text":""},{"location":"reference/nimlite/#tablite.nimlite.ValidQuoting","title":"tablite.nimlite.ValidQuoting = Literal['QUOTE_MINIMAL', 'QUOTE_ALL', 'QUOTE_NONNUMERIC', 'QUOTE_NONE', 'QUOTE_STRINGS', 'QUOTE_NOTNULL'] module-attribute ","text":""},{"location":"reference/nimlite/#tablite.nimlite.ValidSkipEmpty","title":"tablite.nimlite.ValidSkipEmpty = Literal['NONE', 'ANY', 'ALL'] module-attribute ","text":""},{"location":"reference/nimlite/#tablite.nimlite.ColumnSelectorDict","title":"tablite.nimlite.ColumnSelectorDict = TypedDict('ColumnSelectorDict', {'column': str, 'type': Literal['int', 'float', 'bool', 'str', 'date', 'time', 'datetime'], 'allow_empty': Union[bool, None], 'rename': Union[str, None]}) module-attribute ","text":""},{"location":"reference/nimlite/#tablite.nimlite.FilterCriteria","title":"tablite.nimlite.FilterCriteria = Literal['>', '>=', '==', '<', '<=', '!=', 'in'] module-attribute ","text":""},{"location":"reference/nimlite/#tablite.nimlite.FilterType","title":"tablite.nimlite.FilterType = Literal['all', 'any'] module-attribute ","text":""},{"location":"reference/nimlite/#tablite.nimlite.FilterDict","title":"tablite.nimlite.FilterDict = TypedDict('FilterDict', {'column1': str, 'value1': Union[str, None], 'criteria': FilterCriteria, 'column2': str, 'value2': Union[str, None]}) module-attribute ","text":""},{"location":"reference/nimlite/#tablite.nimlite-classes","title":"Classes","text":""},{"location":"reference/nimlite/#tablite.nimlite-functions","title":"Functions","text":""},{"location":"reference/nimlite/#tablite.nimlite.get_headers","title":"tablite.nimlite.get_headers(path: Union[str, Path], encoding: ValidEncoders = 'ENC_UTF8', *, header_row_index: int = 0, newline: str = '\\n', delimiter: str = ',', text_qualifier: str = '\"', quoting: ValidQuoting, strip_leading_and_tailing_whitespace: bool = True, linecount: int = 10) -> list[list[str]] ","text":"Source code in tablite/nimlite.py def get_headers(\n path: Union[str, Path],\n encoding: ValidEncoders =\"ENC_UTF8\",\n *,\n header_row_index: int=0,\n newline: str='\\n', delimiter: str=',', text_qualifier: str='\"',\n quoting: ValidQuoting, strip_leading_and_tailing_whitespace: bool=True,\n linecount: int = 10\n) -> list[list[str]]:\n return nl.get_headers(\n path=str(path),\n encoding=encoding,\n newline=newline, delimiter=delimiter, text_qualifier=text_qualifier,\n strip_leading_and_tailing_whitespace=strip_leading_and_tailing_whitespace,\n header_row_index=header_row_index,\n quoting=quoting,\n linecount=linecount\n )\n "},{"location":"reference/nimlite/#tablite.nimlite.text_reader","title":"tablite.nimlite.text_reader(T: Type[K], pid: str, path: Union[str, Path], encoding: ValidEncoders = 'ENC_UTF8', *, first_row_has_headers: bool = True, header_row_index: int = 0, columns: List[Union[str, None]] = None, start: Union[str, None] = None, limit: Union[str, None] = None, guess_datatypes: bool = False, newline: str = '\\n', delimiter: str = ',', text_qualifier: str = '\"', quoting: ValidQuoting, strip_leading_and_tailing_whitespace: bool = True, skip_empty: ValidSkipEmpty = 'NONE', tqdm=_tqdm) -> K ","text":"Source code in tablite/nimlite.py def text_reader(\n T: Type[K],\n pid: str, path: Union[str, Path],\n encoding: ValidEncoders =\"ENC_UTF8\",\n *,\n first_row_has_headers: bool=True, header_row_index: int=0,\n columns: List[Union[str, None]]=None,\n start: Union[str, None] = None, limit: Union[str, None]=None,\n guess_datatypes: bool =False,\n newline: str='\\n', delimiter: str=',', text_qualifier: 
str='\"',\n quoting: ValidQuoting, strip_leading_and_tailing_whitespace: bool=True, skip_empty: ValidSkipEmpty = \"NONE\",\n tqdm=_tqdm\n) -> K:\n assert isinstance(path, Path)\n assert isinstance(pid, Path)\n with tqdm(total=10, desc=f\"importing file\") as pbar:\n table = nl.text_reader(\n pid=str(pid),\n path=str(path),\n encoding=encoding,\n first_row_has_headers=first_row_has_headers, header_row_index=header_row_index,\n columns=columns,\n start=start, limit=limit,\n guess_datatypes=guess_datatypes,\n newline=newline, delimiter=delimiter, text_qualifier=text_qualifier,\n quoting=quoting,\n strip_leading_and_tailing_whitespace=strip_leading_and_tailing_whitespace,\n skip_empty=skip_empty,\n page_size=Config.PAGE_SIZE\n )\n\n pbar.update(1)\n\n task_info = table[\"task\"]\n task_columns = table[\"columns\"]\n\n ti_tasks = task_info[\"tasks\"]\n ti_import_field_names = task_info[\"import_field_names\"]\n\n is_windows = platform.system() == \"Windows\"\n use_logical = False if is_windows else True\n\n cpus = max(psutil.cpu_count(logical=use_logical), 1)\n\n pbar_step = 4 / max(len(ti_tasks), 1)\n\n class WrapUpdate:\n def update(self, n):\n pbar.update(n * pbar_step)\n\n wrapped_pbar = WrapUpdate()\n\n def next_task(task: Task, page_info):\n wrapped_pbar.update(1)\n return Task(\n nl.text_reader_task,\n *task.args, **task.kwargs, page_info=page_info\n )\n\n tasks = [\n TaskChain(\n Task(\n nl.collect_text_reader_page_info_task,\n task=t,\n task_info=task_info\n ), next_task=next_task\n ) for t in ti_tasks\n ]\n\n is_sp = False\n\n if Config.MULTIPROCESSING_MODE == Config.FALSE:\n is_sp = True\n elif Config.MULTIPROCESSING_MODE == Config.FORCE:\n is_sp = False\n elif Config.MULTIPROCESSING_MODE == Config.AUTO and cpus <= 1 or len(tasks) <= 1:\n is_sp = True\n\n if is_sp:\n res = []\n\n for task in tasks:\n page = task.execute()\n\n res.append(page)\n else:\n with TaskManager(cpus, error_mode=\"exception\") as tm:\n res = tm.execute(tasks, pbar=wrapped_pbar)\n\n col_path = pid\n column_dict = {\n cols: Column(col_path)\n for cols in ti_import_field_names\n }\n\n for res_pages in res:\n col_map = {\n n: res_pages[i]\n for i, n in enumerate(ti_import_field_names)\n }\n\n for k, c in column_dict.items():\n c.pages.append(col_map[k])\n\n if columns is None:\n columns = [c[\"name\"] for c in task_columns]\n\n table_dict = {\n a[\"name\"]: column_dict[b]\n for a, b in zip(task_columns, columns)\n }\n\n pbar.update(pbar.total - pbar.n)\n\n table = T(columns=table_dict)\n\n return table\n "},{"location":"reference/nimlite/#tablite.nimlite.wrap","title":"tablite.nimlite.wrap(str_: str) -> str ","text":"Source code in tablite/nimlite.py def wrap(str_: str) -> str:\n return '\"' + str_.replace('\"', '\\\\\"').replace(\"'\", \"\\\\'\").replace(\"\\n\", \"\\\\n\").replace(\"\\t\", \"\\\\t\") + '\"'\n "},{"location":"reference/nimlite/#tablite.nimlite.column_select","title":"tablite.nimlite.column_select(table: K, cols: list[ColumnSelectorDict], tqdm=_tqdm, TaskManager=TaskManager) -> Tuple[K, K] ","text":"Source code in tablite/nimlite.py def column_select(table: K, cols: list[ColumnSelectorDict], tqdm=_tqdm, TaskManager=TaskManager) -> Tuple[K, K]:\n with tqdm(total=100, desc=\"column select\", bar_format='{desc}: {percentage:.1f}%|{bar}{r_bar}') as pbar:\n T = type(table)\n dir_pid = Config.workdir / Config.pid\n\n col_infos = nl.collect_column_select_info(table, cols, str(dir_pid), pbar)\n\n columns = col_infos[\"columns\"]\n page_count = col_infos[\"page_count\"]\n is_correct_type = 
col_infos[\"is_correct_type\"]\n desired_column_map = col_infos[\"desired_column_map\"]\n original_pages_map = col_infos[\"original_pages_map\"]\n passed_column_data = col_infos[\"passed_column_data\"]\n failed_column_data = col_infos[\"failed_column_data\"]\n res_cols_pass = col_infos[\"res_cols_pass\"]\n res_cols_fail = col_infos[\"res_cols_fail\"]\n column_names = col_infos[\"column_names\"]\n reject_reason_name = col_infos[\"reject_reason_name\"]\n\n if all(is_correct_type.values()):\n tbl_pass_columns = {\n desired_name: table[desired_info[0]]\n for desired_name, desired_info in desired_column_map.items()\n }\n\n tbl_fail_columns = {\n desired_name: []\n for desired_name in failed_column_data\n }\n\n tbl_pass = T(columns=tbl_pass_columns)\n tbl_fail = T(columns=tbl_fail_columns)\n\n return (tbl_pass, tbl_fail)\n\n task_list_inp = (\n _collect_cs_info(i, columns, res_cols_pass, res_cols_fail, original_pages_map)\n for i in range(page_count)\n )\n\n page_size = Config.PAGE_SIZE\n\n tasks = (\n Task(\n nl.do_slice_convert, str(dir_pid), page_size, columns, reject_reason_name, res_pass, res_fail, desired_column_map, column_names, is_correct_type\n )\n for columns, res_pass, res_fail in task_list_inp\n )\n\n cpu_count = max(psutil.cpu_count(), 1)\n\n if Config.MULTIPROCESSING_MODE == Config.FORCE:\n is_mp = True\n elif Config.MULTIPROCESSING_MODE == Config.FALSE:\n is_mp = False\n elif Config.MULTIPROCESSING_MODE == Config.AUTO:\n is_multithreaded = cpu_count > 1\n is_multipage = page_count > 1\n\n is_mp = is_multithreaded and is_multipage\n\n tbl_pass = T({k: [] for k in passed_column_data})\n tbl_fail = T({k: [] for k in failed_column_data})\n\n converted = []\n step_size = 45 / max(page_count, 1)\n\n if is_mp:\n class WrapUpdate:\n def update(self, n):\n pbar.update(n * step_size)\n\n with TaskManager(min(cpu_count, page_count), error_mode=\"exception\") as tm:\n res = tm.execute(list(tasks), pbar=WrapUpdate())\n\n converted.extend(res)\n else:\n for task in tasks:\n res = task.f(*task.args, **task.kwargs)\n\n converted.append(res)\n pbar.update(step_size)\n\n def extend_table(table, columns):\n for (col_name, pg) in columns:\n table[col_name].pages.append(pg)\n\n for pg_pass, pg_fail in converted:\n extend_table(tbl_pass, pg_pass)\n extend_table(tbl_fail, pg_fail)\n\n pbar.update(pbar.total - pbar.n)\n\n return tbl_pass, tbl_fail\n "},{"location":"reference/nimlite/#tablite.nimlite.read_page","title":"tablite.nimlite.read_page(path: Union[str, Path]) -> np.ndarray ","text":"Source code in tablite/nimlite.py def read_page(path: Union[str, Path]) -> np.ndarray:\n return nl.read_page(str(path))\n "},{"location":"reference/nimlite/#tablite.nimlite.repaginate","title":"tablite.nimlite.repaginate(column: Column) ","text":"Source code in tablite/nimlite.py def repaginate(column: Column):\n nl.repaginate(column)\n "},{"location":"reference/nimlite/#tablite.nimlite.nearest_neighbour","title":"tablite.nimlite.nearest_neighbour(T: BaseTable, sources: Union[list[str], None], missing: Union[list, None], targets: Union[list[str], None], tqdm=_tqdm) ","text":"Source code in tablite/nimlite.py def nearest_neighbour(T: BaseTable, sources: Union[list[str], None], missing: Union[list, None], targets: Union[list[str], None], tqdm=_tqdm):\n return nl.nearest_neighbour(T, sources, list(missing), targets, tqdm)\n "},{"location":"reference/nimlite/#tablite.nimlite.groupby","title":"tablite.nimlite.groupby(T, keys, functions, tqdm=_tqdm) ","text":"Source code in tablite/nimlite.py def groupby(T, keys, 
functions, tqdm=_tqdm):\n return nl.groupby(T, keys, functions, tqdm)\n "},{"location":"reference/nimlite/#tablite.nimlite.filter","title":"tablite.nimlite.filter(table: BaseTable, expressions: list[FilterDict], type: FilterType, tqdm=_tqdm) ","text":"Source code in tablite/nimlite.py def filter(table: BaseTable, expressions: list[FilterDict], type: FilterType, tqdm = _tqdm):\n return nl.filter(table, expressions, type, tqdm)\n "},{"location":"reference/pivots/","title":"Pivots","text":""},{"location":"reference/pivots/#tablite.pivots","title":"tablite.pivots ","text":""},{"location":"reference/pivots/#tablite.pivots-classes","title":"Classes","text":""},{"location":"reference/pivots/#tablite.pivots-functions","title":"Functions","text":""},{"location":"reference/pivots/#tablite.pivots.pivot","title":"tablite.pivots.pivot(T, rows, columns, functions, values_as_rows=True, tqdm=_tqdm, pbar=None) ","text":"param: rows: column names to keep as rows param: columns: column names to keep as columns param: functions: aggregation functions from the Groupby class as example: >>> t.show()\n+=====+=====+=====+\n| A | B | C |\n| int | int | int |\n+-----+-----+-----+\n| 1| 1| 6|\n| 1| 2| 5|\n| 2| 3| 4|\n| 2| 4| 3|\n| 3| 5| 2|\n| 3| 6| 1|\n| 1| 1| 6|\n| 1| 2| 5|\n| 2| 3| 4|\n| 2| 4| 3|\n| 3| 5| 2|\n| 3| 6| 1|\n+=====+=====+=====+\n\n>>> t2 = t.pivot(rows=['C'], columns=['A'], functions=[('B', gb.sum)])\n>>> t2.show()\n+===+===+========+=====+=====+=====+\n| # | C |function|(A=1)|(A=2)|(A=3)|\n|row|int| str |mixed|mixed|mixed|\n+---+---+--------+-----+-----+-----+\n|0 | 6|Sum(B) | 2|None |None |\n|1 | 5|Sum(B) | 4|None |None |\n|2 | 4|Sum(B) |None | 6|None |\n|3 | 3|Sum(B) |None | 8|None |\n|4 | 2|Sum(B) |None |None | 10|\n|5 | 1|Sum(B) |None |None | 12|\n+===+===+========+=====+=====+=====+\n Source code in tablite/pivots.py def pivot(T, rows, columns, functions, values_as_rows=True, tqdm=_tqdm, pbar=None):\n \"\"\"\n param: rows: column names to keep as rows\n param: columns: column names to keep as columns\n param: functions: aggregation functions from the Groupby class as\n\n example:\n ```\n >>> t.show()\n +=====+=====+=====+\n | A | B | C |\n | int | int | int |\n +-----+-----+-----+\n | 1| 1| 6|\n | 1| 2| 5|\n | 2| 3| 4|\n | 2| 4| 3|\n | 3| 5| 2|\n | 3| 6| 1|\n | 1| 1| 6|\n | 1| 2| 5|\n | 2| 3| 4|\n | 2| 4| 3|\n | 3| 5| 2|\n | 3| 6| 1|\n +=====+=====+=====+\n\n >>> t2 = t.pivot(rows=['C'], columns=['A'], functions=[('B', gb.sum)])\n >>> t2.show()\n +===+===+========+=====+=====+=====+\n | # | C |function|(A=1)|(A=2)|(A=3)|\n |row|int| str |mixed|mixed|mixed|\n +---+---+--------+-----+-----+-----+\n |0 | 6|Sum(B) | 2|None |None |\n |1 | 5|Sum(B) | 4|None |None |\n |2 | 4|Sum(B) |None | 6|None |\n |3 | 3|Sum(B) |None | 8|None |\n |4 | 2|Sum(B) |None |None | 10|\n |5 | 1|Sum(B) |None |None | 12|\n +===+===+========+=====+=====+=====+\n ```\n\n \"\"\"\n sub_cls_check(T, BaseTable)\n\n if isinstance(rows, str):\n rows = [rows]\n if not all(isinstance(i, str) for i in rows):\n raise TypeError(f\"Expected rows as a list of column names, not {[i for i in rows if not isinstance(i,str)]}\")\n\n if isinstance(columns, str):\n columns = [columns]\n if not all(isinstance(i, str) for i in columns):\n raise TypeError(\n f\"Expected columns as a list of column names, not {[i for i in columns if not isinstance(i, str)]}\"\n )\n\n if not isinstance(values_as_rows, bool):\n raise TypeError(f\"expected sum_on_rows as boolean, not {type(values_as_rows)}\")\n\n keys = rows + columns\n assert isinstance(keys, list)\n\n 
extra_steps = 2\n\n if pbar is None:\n total = extra_steps\n\n if len(functions) == 0:\n total = total + len(keys)\n else:\n total = total + len(T)\n\n pbar = tqdm(total=total, desc=\"pivot\")\n\n grpby = groupby(T, keys, functions, tqdm=tqdm)\n Constr = type(T)\n\n if len(grpby) == 0: # return empty table. This must be a test?\n pbar.update(extra_steps)\n return Constr()\n\n # split keys to determine grid dimensions\n row_key_index = {}\n col_key_index = {}\n\n r = len(rows)\n c = len(columns)\n g = len(functions)\n\n records = defaultdict(dict)\n\n for row in grpby.rows:\n row_key = tuple(row[:r])\n col_key = tuple(row[r : r + c])\n func_key = tuple(row[r + c :])\n\n if row_key not in row_key_index:\n row_key_index[row_key] = len(row_key_index) # Y\n\n if col_key not in col_key_index:\n col_key_index[col_key] = len(col_key_index) # X\n\n rix = row_key_index[row_key]\n cix = col_key_index[col_key]\n if cix in records:\n if rix in records[cix]:\n raise ValueError(\"this should be empty.\")\n records[cix][rix] = func_key\n\n pbar.update(1)\n result = type(T)()\n\n if values_as_rows: # ---> leads to more rows.\n # first create all columns left to right\n\n n = r + 1 # rows keys + 1 col for function values.\n cols = [[] for _ in range(n)]\n for row, ix in row_key_index.items():\n for col_name, f in functions:\n cols[-1].append(f\"{f}({col_name})\")\n for col_ix, v in enumerate(row):\n cols[col_ix].append(v)\n\n for col_name, values in zip(rows + [\"function\"], cols):\n col_name = unique_name(col_name, result.columns)\n result[col_name] = values\n col_length = len(cols[0])\n cols.clear()\n\n # then populate the sparse matrix.\n for col_key, c in col_key_index.items():\n col_name = \"(\" + \",\".join([f\"{col_name}={value}\" for col_name, value in zip(columns, col_key)]) + \")\"\n col_name = unique_name(col_name, result.columns)\n L = [None for _ in range(col_length)]\n for r, funcs in records[c].items():\n for ix, f in enumerate(funcs):\n L[g * r + ix] = f\n result[col_name] = L\n\n else: # ---> leads to more columns.\n n = r\n cols = [[] for _ in range(n)]\n for row in row_key_index:\n for col_ix, v in enumerate(row):\n cols[col_ix].append(v) # write key columns.\n\n for col_name, values in zip(rows, cols):\n result[col_name] = values\n\n col_length = len(row_key_index)\n\n # now populate the sparse matrix.\n for col_key, c in col_key_index.items(): # select column.\n cols, names = [], []\n\n for f, v in zip(functions, func_key):\n agg_col, func = f\n terms = \",\".join([agg_col] + [f\"{col_name}={value}\" for col_name, value in zip(columns, col_key)])\n col_name = f\"{func}({terms})\"\n col_name = unique_name(col_name, result.columns)\n names.append(col_name)\n cols.append([None for _ in range(col_length)])\n for r, funcs in records[c].items():\n for ix, f in enumerate(funcs):\n cols[ix][r] = f\n for name, col in zip(names, cols):\n result[name] = col\n\n pbar.update(1)\n\n return result\n "},{"location":"reference/pivots/#tablite.pivots.transpose","title":"tablite.pivots.transpose(T, tqdm=_tqdm) ","text":"performs a CCW matrix rotation of the table. 
Source code in tablite/pivots.py def transpose(T, tqdm=_tqdm):\n \"\"\"performs a CCW matrix rotation of the table.\"\"\"\n sub_cls_check(T, BaseTable)\n\n if len(T.columns) == 0:\n return type(T)()\n\n assert isinstance(T, BaseTable)\n new = type(T)()\n L = list(T.columns)\n new[L[0]] = L[1:]\n for row in tqdm(T.rows, desc=\"table transpose\", total=len(T)):\n new[row[0]] = row[1:]\n return new\n "},{"location":"reference/pivots/#tablite.pivots.pivot_transpose","title":"tablite.pivots.pivot_transpose(T, columns, keep=None, column_name='transpose', value_name='value', tqdm=_tqdm) ","text":"Transpose a selection of columns to rows. PARAMETER DESCRIPTION columns column names to transpose TYPE: list of column names keep column names to keep (repeat) TYPE: list of column names DEFAULT: None RETURNS DESCRIPTION Table with columns transposed to rows Example keep columns 1,2 and 3 and transpose the remaining columns, except sum . Input: | col1 | col2 | col3 | sun | mon | tue | ... | sat | sum |\n|------|------|------|-----|-----|-----|-----|-----|------|\n| 1234 | 2345 | 3456 | 456 | 567 | | ... | | 1023 |\n| 1244 | 2445 | 4456 | | 7 | | ... | | 7 |\n| ... | | | | | | | | |\n\n>>> t.transpose(keep=[col1, col2, col3], transpose=[sun,mon,tue,wed,thu,fri,sat])`\n\nOutput:\n|col1| col2| col3| transpose| value|\n|----|-----|-----|----------|------|\n|1234| 2345| 3456| sun | 456|\n|1234| 2345| 3456| mon | 567|\n|1244| 2445| 4456| mon | 7|\n Source code in tablite/pivots.py def pivot_transpose(T, columns, keep=None, column_name=\"transpose\", value_name=\"value\", tqdm=_tqdm):\n \"\"\"Transpose a selection of columns to rows.\n\n Args:\n columns (list of column names): column names to transpose\n keep (list of column names): column names to keep (repeat)\n\n Returns:\n Table: with columns transposed to rows\n\n Example:\n keep columns 1,2 and 3 and transpose the remaining columns, except `sum`.\n\n Input:\n ```\n | col1 | col2 | col3 | sun | mon | tue | ... | sat | sum |\n |------|------|------|-----|-----|-----|-----|-----|------|\n | 1234 | 2345 | 3456 | 456 | 567 | | ... | | 1023 |\n | 1244 | 2445 | 4456 | | 7 | | ... | | 7 |\n | ... 
| | | | | | | | |\n\n >>> t.transpose(keep=[col1, col2, col3], transpose=[sun,mon,tue,wed,thu,fri,sat])`\n\n Output:\n |col1| col2| col3| transpose| value|\n |----|-----|-----|----------|------|\n |1234| 2345| 3456| sun | 456|\n |1234| 2345| 3456| mon | 567|\n |1244| 2445| 4456| mon | 7|\n ```\n\n \"\"\"\n sub_cls_check(T, BaseTable)\n\n if not isinstance(columns, list):\n raise TypeError\n\n for i in columns:\n if not isinstance(i, str):\n raise TypeError\n if i not in T.columns:\n raise ValueError\n if columns.count(i)>1:\n raise ValueError(f\"Column {i} appears more than once\")\n\n if keep is None:\n keep = []\n for i in keep:\n if not isinstance(i, str):\n raise TypeError\n if i not in T.columns:\n raise ValueError\n\n if column_name in keep + columns:\n column_name = unique_name(column_name, set_of_names=keep + columns)\n if value_name in keep + columns + [column_name]:\n value_name = unique_name(value_name, set_of_names=keep + columns)\n\n new = type(T)()\n new.add_columns(*keep + [column_name, value_name])\n news = {name: [] for name in new.columns}\n\n n = len(keep)\n\n with tqdm(total=len(T), desc=\"transpose\", disable=Config.TQDM_DISABLE) as pbar:\n it = T[keep + columns].rows if len(keep + columns) > 1 else ((v, ) for v in T[keep + columns])\n\n for ix, row in enumerate(it, start=1):\n keeps = row[:n]\n transposes = row[n:]\n\n for name, value in zip(keep, keeps):\n news[name].extend([value] * len(transposes))\n for name, value in zip(columns, transposes):\n news[column_name].append(name)\n news[value_name].append(value)\n\n if ix % Config.SINGLE_PROCESSING_LIMIT == 0:\n for name, values in news.items():\n new[name].extend(values)\n values.clear()\n\n pbar.update(1)\n\n for name, values in news.items():\n new[name].extend(np.array(values))\n values.clear()\n return new\n "},{"location":"reference/redux/","title":"Redux","text":""},{"location":"reference/redux/#tablite.redux","title":"tablite.redux ","text":""},{"location":"reference/redux/#tablite.redux-attributes","title":"Attributes","text":""},{"location":"reference/redux/#tablite.redux-classes","title":"Classes","text":""},{"location":"reference/redux/#tablite.redux-functions","title":"Functions","text":""},{"location":"reference/redux/#tablite.redux.filter_all","title":"tablite.redux.filter_all(T, **kwargs) ","text":"returns Table for rows where ALL kwargs match :param kwargs: dictionary with headers and values / boolean callable Examples: t = Table()\nt['a'] = [1,2,3,4]\nt['b'] = [10,20,30,40]\n\ndef f(x):\n return x == 4\ndef g(x):\n return x < 20\n\nt2 = t.any( **{\"a\":f, \"b\":g})\nassert [r for r in t2.rows] == [[1, 10], [4, 40]]\n\nt2 = t.any(a=f,b=g)\nassert [r for r in t2.rows] == [[1, 10], [4, 40]]\n\ndef h(x):\n return x>=2\n\ndef i(x):\n return x<=30\n\nt2 = t.all(a=h,b=i)\nassert [r for r in t2.rows] == [[2,20], [3, 30]]\n Source code in tablite/redux.py def filter_all(T, **kwargs):\n \"\"\"\n returns Table for rows where ALL kwargs match\n :param kwargs: dictionary with headers and values / boolean callable\n\n Examples:\n\n t = Table()\n t['a'] = [1,2,3,4]\n t['b'] = [10,20,30,40]\n\n def f(x):\n return x == 4\n def g(x):\n return x < 20\n\n t2 = t.any( **{\"a\":f, \"b\":g})\n assert [r for r in t2.rows] == [[1, 10], [4, 40]]\n\n t2 = t.any(a=f,b=g)\n assert [r for r in t2.rows] == [[1, 10], [4, 40]]\n\n def h(x):\n return x>=2\n\n def i(x):\n return x<=30\n\n t2 = t.all(a=h,b=i)\n assert [r for r in t2.rows] == [[2,20], [3, 30]]\n\n\n \"\"\"\n sub_cls_check(T, BaseTable)\n\n if not isinstance(kwargs, 
dict):\n raise TypeError(\"did you forget to add the ** in front of your dict?\")\n if not all([k in T.columns for k in kwargs]):\n raise ValueError(f\"Unknown column(s): {[k for k in kwargs if k not in T.columns]}\")\n\n mask = np.full((len(T),), True)\n for k, v in kwargs.items():\n col = T[k]\n for start, end, page in col.iter_by_page():\n data = page.get()\n if callable(v):\n vf = np.frompyfunc(v, 1, 1)\n mask[start:end] = mask[start:end] & np.apply_along_axis(vf, 0, data)\n else:\n mask[start:end] = mask[start:end] & (data == v)\n\n return _compress_one(T, mask)\n "},{"location":"reference/redux/#tablite.redux.drop","title":"tablite.redux.drop(T, *args) ","text":"drops all rows that contain args PARAMETER DESCRIPTION T TYPE: Table Source code in tablite/redux.py def drop(T, *args):\n \"\"\"drops all rows that contain args\n\n Args:\n T (Table):\n \"\"\"\n sub_cls_check(T, BaseTable)\n mask = np.full((len(T),), False)\n for name in T.columns:\n col = T[name]\n for start, end, page in col.iter_by_page():\n data = page.get()\n for arg in args:\n mask[start:end] = mask[start:end] | (data == arg)\n\n mask = np.invert(mask)\n return _compress_one(T, mask)\n "},{"location":"reference/redux/#tablite.redux.filter_any","title":"tablite.redux.filter_any(T, **kwargs) ","text":"returns Table for rows where ANY kwargs match :param kwargs: dictionary with headers and values / boolean callable Source code in tablite/redux.py def filter_any(T, **kwargs):\n \"\"\"\n returns Table for rows where ANY kwargs match\n :param kwargs: dictionary with headers and values / boolean callable\n \"\"\"\n sub_cls_check(T, BaseTable)\n if not isinstance(kwargs, dict):\n raise TypeError(\"did you forget to add the ** in front of your dict?\")\n\n mask = np.full((len(T),), False)\n for k, v in kwargs.items():\n col = T[k]\n for start, end, page in col.iter_by_page():\n data = page.get()\n if callable(v):\n vf = np.frompyfunc(v, 1, 1)\n mask[start:end] = mask[start:end] | np.apply_along_axis(vf, 0, data)\n else:\n mask[start:end] = mask[start:end] | (v == data)\n\n return _compress_one(T, mask)\n "},{"location":"reference/redux/#tablite.redux.filter_non_primitive","title":"tablite.redux.filter_non_primitive(T, expressions, filter_type='all', tqdm=_tqdm) ","text":"OBSOLETE filters table PARAMETER DESCRIPTION T Table. TYPE: Table subclass expressions str: filters based on an expression, such as: \"all((A==B, C!=4, 200<D))\" which is interpreted using python's compiler to: def _f(A,B,C,D):\n return all((A==B, C!=4, 200<D))\n list of dicts: (example): L = [ {'column1':'A', 'criteria': \"==\", 'column2': 'B'}, {'column1':'C', 'criteria': \"!=\", \"value2\": '4'}, {'value1': 200, 'criteria': \"<\", column2: 'D' } ] TYPE: list or str accepted 'column1', 'column2', 'criteria', 'value1', 'value2' TYPE: dictionary keys filter_type Ignored if expressions is str. 'all' or 'any'. Defaults to \"all\". TYPE: str DEFAULT: 'all' tqdm progressbar. Defaults to _tqdm. 
TYPE: tqdm DEFAULT: tqdm RETURNS DESCRIPTION 2xTables trues, falses Source code in tablite/redux.py def filter_non_primitive(T, expressions, filter_type=\"all\", tqdm=_tqdm):\n \"\"\"\n OBSOLETE\n filters table\n\n\n Args:\n T (Table subclass): Table.\n expressions (list or str):\n str:\n filters based on an expression, such as:\n \"all((A==B, C!=4, 200<D))\"\n which is interpreted using python's compiler to:\n\n def _f(A,B,C,D):\n return all((A==B, C!=4, 200<D))\n\n list of dicts: (example):\n\n L = [\n {'column1':'A', 'criteria': \"==\", 'column2': 'B'},\n {'column1':'C', 'criteria': \"!=\", \"value2\": '4'},\n {'value1': 200, 'criteria': \"<\", column2: 'D' }\n ]\n\n accepted dictionary keys: 'column1', 'column2', 'criteria', 'value1', 'value2'\n\n filter_type (str, optional): Ignored if expressions is str.\n 'all' or 'any'. Defaults to \"all\".\n tqdm (tqdm, optional): progressbar. Defaults to _tqdm.\n\n Returns:\n 2xTables: trues, falses\n \"\"\"\n # determine method\n warnings.warn(\"Filter using non-primitive types is not recommended.\")\n sub_cls_check(T, BaseTable)\n if len(T) == 0:\n return T.copy(), T.copy()\n\n with tqdm(desc=\"filter\", total=20) as pbar:\n if isinstance(expressions, str):\n mask = _filter_using_expression(T, expressions)\n pbar.update(10)\n elif isinstance(expressions, list):\n mask = _filter_using_list_of_dicts(T, expressions, filter_type, pbar)\n else:\n raise TypeError\n # create new tables\n res = _compress_both(T, mask, pbar=pbar)\n pbar.update(pbar.total - pbar.n)\n\n return res\n "},{"location":"reference/redux/#tablite.redux.filter","title":"tablite.redux.filter(T, expressions, filter_type='all', tqdm=_tqdm) ","text":"filters table Note: At the moment only tablite primitive types are supported PARAMETER DESCRIPTION T Table. TYPE: Table subclass expressions str: filters based on an expression, such as: \"all((A==B, C!=4, 200<D))\" which is interpreted using python's compiler to: def _f(A,B,C,D):\n return all((A==B, C!=4, 200<D))\n list of dicts: (example): L = [ {'column1':'A', 'criteria': \"==\", 'column2': 'B'}, {'column1':'C', 'criteria': \"!=\", \"value2\": '4'}, {'value1': 200, 'criteria': \"<\", column2: 'D' } ] TYPE: list or str accepted 'column1', 'column2', 'criteria', 'value1', 'value2' TYPE: dictionary keys filter_type Ignored if expressions is str. 'all' or 'any'. Defaults to \"all\". TYPE: str DEFAULT: 'all' tqdm progressbar. Defaults to _tqdm. TYPE: tqdm DEFAULT: tqdm RETURNS DESCRIPTION 2xTables trues, falses Source code in tablite/redux.py def filter(T, expressions, filter_type=\"all\", tqdm=_tqdm):\n \"\"\"filters table\n Note: At the moment only tablite primitive types are supported\n\n Args:\n T (Table subclass): Table.\n expressions (list or str):\n str:\n filters based on an expression, such as:\n \"all((A==B, C!=4, 200<D))\"\n which is interpreted using python's compiler to:\n\n def _f(A,B,C,D):\n return all((A==B, C!=4, 200<D))\n\n list of dicts: (example):\n\n L = [\n {'column1':'A', 'criteria': \"==\", 'column2': 'B'},\n {'column1':'C', 'criteria': \"!=\", \"value2\": '4'},\n {'value1': 200, 'criteria': \"<\", column2: 'D' }\n ]\n\n accepted dictionary keys: 'column1', 'column2', 'criteria', 'value1', 'value2'\n\n filter_type (str, optional): Ignored if expressions is str.\n 'all' or 'any'. Defaults to \"all\".\n tqdm (tqdm, optional): progressbar. 
Defaults to _tqdm.\n\n Returns:\n 2xTables: trues, falses\n \"\"\"\n # determine method\n sub_cls_check(T, BaseTable)\n if len(T) == 0:\n return T.copy(), T.copy()\n\n if isinstance(expressions, str):\n with tqdm(desc=\"filter\", total=20) as pbar:\n # TODO: make parser for expressions and use the nim implement\n mask = _filter_using_expression(T, expressions)\n pbar.update(10)\n res = _compress_both(T, mask, pbar=pbar)\n pbar.update(pbar.total - pbar.n)\n elif isinstance(expressions, list):\n return _filter_using_list_of_dicts_native(T, expressions, filter_type, tqdm)\n else:\n raise TypeError\n # create new tables\n\n return res\n "},{"location":"reference/reindex/","title":"Reindex","text":""},{"location":"reference/reindex/#tablite.reindex","title":"tablite.reindex ","text":""},{"location":"reference/reindex/#tablite.reindex-classes","title":"Classes","text":""},{"location":"reference/reindex/#tablite.reindex-functions","title":"Functions","text":""},{"location":"reference/reindex/#tablite.reindex.reindex","title":"tablite.reindex.reindex(T, index, names=None, tqdm=_tqdm, pbar=None) ","text":"Constant Memory helper for reindexing pages. Memory usage is set by datatype and Config.PAGE_SIZE PARAMETER DESCRIPTION T subclass of Table TYPE: Table index int64. TYPE: array names list of names from T to reindex. TYPE: (list, str) DEFAULT: None tqdm Defaults to _tqdm. TYPE: tqdm DEFAULT: tqdm pbar Defaults to None. TYPE: pbar DEFAULT: None RETURNS DESCRIPTION _type_ description Source code in tablite/reindex.py def reindex(T, index, names=None, tqdm=_tqdm, pbar=None):\n \"\"\"Constant Memory helper for reindexing pages.\n\n Memory usage is set by datatype and Config.PAGE_SIZE\n\n Args:\n T (Table): subclass of Table\n index (np.array): int64.\n names (list, str): list of names from T to reindex.\n tqdm (tqdm, optional): Defaults to _tqdm.\n pbar (pbar, optional): Defaults to None.\n\n Returns:\n _type_: _description_\n \"\"\"\n if names is None:\n names = list(T.columns.keys())\n\n if pbar is None:\n total = len(names)\n pbar = tqdm(total=total, desc=\"join\", disable=Config.TQDM_DISABLE)\n\n sub_cls_check(T, BaseTable)\n cls = type(T)\n result = cls()\n for name in names:\n result.add_column(name)\n col = result[name]\n\n for start, end in Config.page_steps(len(index)):\n indices = index[start:end]\n values = T[name].get_by_indices(indices)\n # in these values, the index of -1 will be wrong.\n # so if there is any -1 in the indices, they will\n # have to be replaced with Nones\n mask = indices == -1\n if np.any(mask):\n nones = np.full(index.shape, fill_value=None)\n values = np.where(mask, nones, values)\n col.extend(values)\n pbar.update(1)\n\n return result\n "},{"location":"reference/sort_utils/","title":"Sort utils","text":""},{"location":"reference/sort_utils/#tablite.sort_utils","title":"tablite.sort_utils ","text":""},{"location":"reference/sort_utils/#tablite.sort_utils-attributes","title":"Attributes","text":""},{"location":"reference/sort_utils/#tablite.sort_utils.uca_collator","title":"tablite.sort_utils.uca_collator = Collator() module-attribute ","text":""},{"location":"reference/sort_utils/#tablite.sort_utils.modes","title":"tablite.sort_utils.modes = {'alphanumeric': text_sort, 'unix': unix_sort, 'excel': excel_sort} module-attribute ","text":""},{"location":"reference/sort_utils/#tablite.sort_utils-classes","title":"Classes","text":""},{"location":"reference/sort_utils/#tablite.sort_utils.HashDict","title":"tablite.sort_utils.HashDict ","text":" Bases: dict This class is just 
a nice syntactic sugar for debugging. Functions identically to a regular dictionary, just uses a tupled key. "},{"location":"reference/sort_utils/#tablite.sort_utils.HashDict-functions","title":"Functions","text":""},{"location":"reference/sort_utils/#tablite.sort_utils.HashDict.items","title":"tablite.sort_utils.HashDict.items() ","text":"Source code in tablite/sort_utils.py def items(self):\n return [(k, v) for (_, k), v in super().items()]\n "},{"location":"reference/sort_utils/#tablite.sort_utils.HashDict.keys","title":"tablite.sort_utils.HashDict.keys() ","text":"Source code in tablite/sort_utils.py def keys(self):\n return [k for (_, k) in super().keys()]\n "},{"location":"reference/sort_utils/#tablite.sort_utils.HashDict.__iter__","title":"tablite.sort_utils.HashDict.__iter__() -> Iterator ","text":"Source code in tablite/sort_utils.py def __iter__(self) -> Iterator:\n return (k for (_, k) in super().keys())\n "},{"location":"reference/sort_utils/#tablite.sort_utils.HashDict.__getitem__","title":"tablite.sort_utils.HashDict.__getitem__(key) ","text":"Source code in tablite/sort_utils.py def __getitem__(self, key):\n return super().__getitem__(self._get_hash(key))\n "},{"location":"reference/sort_utils/#tablite.sort_utils.HashDict.__setitem__","title":"tablite.sort_utils.HashDict.__setitem__(key, value) ","text":"Source code in tablite/sort_utils.py def __setitem__(self, key, value):\n return super().__setitem__(self._get_hash(key), value)\n "},{"location":"reference/sort_utils/#tablite.sort_utils.HashDict.__contains__","title":"tablite.sort_utils.HashDict.__contains__(key) -> bool ","text":"Source code in tablite/sort_utils.py def __contains__(self, key) -> bool:\n return super().__contains__(self._get_hash(key))\n "},{"location":"reference/sort_utils/#tablite.sort_utils.HashDict.__delitem__","title":"tablite.sort_utils.HashDict.__delitem__(key) ","text":"Source code in tablite/sort_utils.py def __delitem__(self, key):\n return super().__delitem__(self._get_hash(key))\n "},{"location":"reference/sort_utils/#tablite.sort_utils.HashDict.__repr__","title":"tablite.sort_utils.HashDict.__repr__() -> str ","text":"Source code in tablite/sort_utils.py def __repr__(self) -> str:\n return '{' + \", \".join([f\"{k}: {v}\" for k, v in self.items()]) + '}'\n "},{"location":"reference/sort_utils/#tablite.sort_utils.HashDict.__str__","title":"tablite.sort_utils.HashDict.__str__() -> str ","text":"Source code in tablite/sort_utils.py def __str__(self) -> str:\n return repr(self)\n "},{"location":"reference/sort_utils/#tablite.sort_utils-functions","title":"Functions","text":""},{"location":"reference/sort_utils/#tablite.sort_utils.text_sort","title":"tablite.sort_utils.text_sort(values, reverse=False) ","text":"Sorts everything as text. 
Source code in tablite/sort_utils.py def text_sort(values, reverse=False):\n \"\"\"\n Sorts everything as text.\n \"\"\"\n text = {str(i): i for i in values}\n L = list(text.keys())\n L.sort(key=uca_collator.sort_key, reverse=reverse)\n d = {text[value]: ix for ix, value in enumerate(L)}\n return d\n "},{"location":"reference/sort_utils/#tablite.sort_utils.unix_sort","title":"tablite.sort_utils.unix_sort(values, reverse=False) ","text":"Unix sortation sorts by the following order: | rank | type | value | +------+-----------+--------------------------------------------+ | 0 | None | floating point -infinite | | 1 | bool | 0 as False, 1 as True | | 2 | int | as numeric value | | 2 | float | as numeric value | | 3 | time | \u03c4 * seconds into the day / (24 * 60 * 60) | | 4 | date | as integer days since 1970/1/1 | | 5 | datetime | as float using date (int) + time (decimal) | | 6 | timedelta | as float using date (int) + time (decimal) | | 7 | str | using unicode | +------+-----------+--------------------------------------------+ \u03c4 = 2 * \u03c0 Source code in tablite/sort_utils.py def unix_sort(values, reverse=False):\n \"\"\"\n Unix sortation sorts by the following order:\n\n | rank | type | value |\n +------+-----------+--------------------------------------------+\n | 0 | None | floating point -infinite |\n | 1 | bool | 0 as False, 1 as True |\n | 2 | int | as numeric value |\n | 2 | float | as numeric value |\n | 3 | time | \u03c4 * seconds into the day / (24 * 60 * 60) |\n | 4 | date | as integer days since 1970/1/1 |\n | 5 | datetime | as float using date (int) + time (decimal) |\n | 6 | timedelta | as float using date (int) + time (decimal) |\n | 7 | str | using unicode |\n +------+-----------+--------------------------------------------+\n\n \u03c4 = 2 * \u03c0\n\n \"\"\"\n text, non_text = [], []\n\n # L = []\n # text = [i for i in values if isinstance(i, str)]\n # text.sort(key=uca_collator.sort_key, reverse=reverse)\n # text_code = _unix_typecodes[str]\n # L = [(text_code, ix, v) for ix, v in enumerate(text)]\n\n for value in values:\n if isinstance(value, str):\n text.append(value)\n else:\n t = type(value)\n TC = _unix_typecodes[t]\n tf = _unix_value_function[t]\n VC = tf(value)\n non_text.append((TC, VC, value))\n non_text.sort(reverse=reverse)\n\n text.sort(key=uca_collator.sort_key, reverse=reverse)\n text_code = _unix_typecodes[str]\n text = [(text_code, ix, v) for ix, v in enumerate(text)]\n\n d = HashDict()\n L = non_text + text\n for ix, (_, _, value) in enumerate(L):\n d[value] = ix\n return d\n "},{"location":"reference/sort_utils/#tablite.sort_utils.excel_sort","title":"tablite.sort_utils.excel_sort(values, reverse=False) ","text":"Excel sortation sorts by the following order: | rank | type | value | +------+-----------+--------------------------------------------+ | 1 | int | as numeric value | | 1 | float | as numeric value | | 1 | time | as seconds into the day / (24 * 60 * 60) | | 1 | date | as integer days since 1900/1/1 | | 1 | datetime | as float using date (int) + time (decimal) | | (1)*| timedelta | as float using date (int) + time (decimal) | | 2 | str | using unicode | | 3 | bool | 0 as False, 1 as True | | 4 | None | floating point infinite. | +------+-----------+--------------------------------------------+ - Excel doesn't have timedelta.
Source code in tablite/sort_utils.py def excel_sort(values, reverse=False):\n \"\"\"\n Excel sortation sorts by the following order:\n\n | rank | type | value |\n +------+-----------+--------------------------------------------+\n | 1 | int | as numeric value |\n | 1 | float | as numeric value |\n | 1 | time | as seconds into the day / (24 * 60 * 60) |\n | 1 | date | as integer days since 1900/1/1 |\n | 1 | datetime | as float using date (int) + time (decimal) |\n | (1)*| timedelta | as float using date (int) + time (decimal) |\n | 2 | str | using unicode |\n | 3 | bool | 0 as False, 1 as True |\n | 4 | None | floating point infinite. |\n +------+-----------+--------------------------------------------+\n\n * Excel doesn't have timedelta.\n \"\"\"\n\n def tup(TC, value):\n return (TC, _excel_value_function[t](value), value)\n\n text, numeric, booles, nones = [], [], [], []\n for value in values:\n t = type(value)\n TC = _excel_typecodes[t]\n\n if TC == 0:\n numeric.append(tup(TC, value))\n elif TC == 1:\n text.append(value) # text is processed later.\n elif TC == 2:\n booles.append(tup(TC, value))\n elif TC == 3:\n booles.append(tup(TC, value))\n else:\n raise TypeError(f\"no typecode for {value}\")\n\n if text:\n text.sort(key=uca_collator.sort_key, reverse=reverse)\n text = [(2, ix, v) for ix, v in enumerate(text)]\n\n numeric.sort(reverse=reverse)\n booles.sort(reverse=reverse)\n nones.sort(reverse=reverse)\n\n if reverse:\n L = nones + booles + text + numeric\n else:\n L = numeric + text + booles + nones\n d = {value: ix for ix, (_, _, value) in enumerate(L)}\n return d\n "},{"location":"reference/sort_utils/#tablite.sort_utils.rank","title":"tablite.sort_utils.rank(values, reverse, mode) ","text":"values: list of values to sort. reverse: bool mode: as 'text', as 'numeric' or as 'excel' return: dict: d[value] = rank Source code in tablite/sort_utils.py def rank(values, reverse, mode):\n \"\"\"\n values: list of values to sort.\n reverse: bool\n mode: as 'text', as 'numeric' or as 'excel'\n return: dict: d[value] = rank\n \"\"\"\n if mode not in modes:\n raise ValueError(f\"{mode} not in list of modes: {list(modes)}\")\n f = modes.get(mode)\n return f(values, reverse)\n "},{"location":"reference/sortation/","title":"Sortation","text":""},{"location":"reference/sortation/#tablite.sortation","title":"tablite.sortation ","text":""},{"location":"reference/sortation/#tablite.sortation-attributes","title":"Attributes","text":""},{"location":"reference/sortation/#tablite.sortation-classes","title":"Classes","text":""},{"location":"reference/sortation/#tablite.sortation-functions","title":"Functions","text":""},{"location":"reference/sortation/#tablite.sortation.sort_index","title":"tablite.sortation.sort_index(T, mapping, sort_mode='excel', tqdm=_tqdm, pbar=None) ","text":"helper for methods sort and is_sorted param: sort_mode: str: \"alphanumeric\", \"unix\", or, \"excel\" (default) param: **kwargs: sort criteria. See Table.sort() Source code in tablite/sortation.py def sort_index(T, mapping, sort_mode=\"excel\", tqdm=_tqdm, pbar=None):\n \"\"\"\n helper for methods `sort` and `is_sorted`\n\n param: sort_mode: str: \"alphanumeric\", \"unix\", or, \"excel\" (default)\n param: **kwargs: sort criteria. 
See Table.sort()\n \"\"\"\n\n sub_cls_check(T, BaseTable)\n\n if not isinstance(mapping, dict) or not mapping:\n raise TypeError(\"Expected mapping (dict)?\")\n\n for k, v in mapping.items():\n if k not in T.columns:\n raise ValueError(f\"no column {k}\")\n if not isinstance(v, bool):\n raise ValueError(f\"{k} was mapped to {v} - a non-boolean\")\n\n if sort_mode not in sort_modes:\n raise ValueError(f\"{sort_mode} not in list of sort_modes: {list(sort_modes)}\")\n\n rank = {i: tuple() for i in range(len(T))} # create index and empty tuple for sortation.\n\n _pbar = tqdm(total=len(mapping.items()), desc=\"creating sort index\") if pbar is None else pbar\n\n for key, reverse in mapping.items():\n col = T[key][:]\n ranks = sort_rank(values=[numpy_to_python(v) for v in multitype_set(col)], reverse=reverse, mode=sort_mode)\n assert isinstance(ranks, dict)\n for ix, v in enumerate(col):\n v2 = numpy_to_python(v)\n rank[ix] += (ranks[v2],) # add tuple for each sortation level.\n\n _pbar.update(1)\n\n del col\n del ranks\n\n new_order = [(r, i) for i, r in rank.items()] # tuples are listed and sort...\n del rank # free memory.\n\n new_order.sort()\n sorted_index = [i for _, i in new_order] # new index is extracted.\n new_order.clear()\n return np.array(sorted_index, dtype=np.int64)\n "},{"location":"reference/sortation/#tablite.sortation.reindex","title":"tablite.sortation.reindex(T, index) ","text":"index: list of integers that declare sort order. Examples: Table: ['a','b','c','d','e','f','g','h']\nindex: [0,2,4,6]\nresult: ['b','d','f','h']\n\nTable: ['a','b','c','d','e','f','g','h']\nindex: [0,2,4,6,1,3,5,7]\nresult: ['a','c','e','g','b','d','f','h']\n Source code in tablite/sortation.py def reindex(T, index):\n \"\"\"\n index: list of integers that declare sort order.\n\n Examples:\n\n Table: ['a','b','c','d','e','f','g','h']\n index: [0,2,4,6]\n result: ['b','d','f','h']\n\n Table: ['a','b','c','d','e','f','g','h']\n index: [0,2,4,6,1,3,5,7]\n result: ['a','c','e','g','b','d','f','h']\n\n \"\"\"\n sub_cls_check(T, BaseTable)\n if isinstance(index, list):\n index = np.array(index, dtype=int)\n type_check(index, np.ndarray)\n if max(index) >= len(T):\n raise IndexError(\"index out of range: max(index) > len(self)\")\n if min(index) < -len(T):\n raise IndexError(\"index out of range: min(index) < -len(self)\")\n\n fields = len(T) * len(T.columns)\n m = select_processing_method(fields, _reindex, _mp_reindex)\n return m(T, index)\n "},{"location":"reference/sortation/#tablite.sortation.sort","title":"tablite.sortation.sort(T, mapping, sort_mode='excel', tqdm=_tqdm, pbar: _tqdm = None) ","text":"Perform multi-pass sorting with precedence given order of column names. sort_mode: str: \"alphanumeric\", \"unix\", or, \"excel\" kwargs: keys: columns, values: 'reverse' as boolean. examples: Table.sort('A'=False) means sort by 'A' in ascending order. Table.sort('A'=True, 'B'=False) means sort 'A' in descending order, then (2nd priority) sort B in ascending order. 
Source code in tablite/sortation.py def sort(T, mapping, sort_mode=\"excel\", tqdm=_tqdm, pbar: _tqdm = None):\n \"\"\"Perform multi-pass sorting with precedence given order of column names.\n sort_mode: str: \"alphanumeric\", \"unix\", or, \"excel\"\n kwargs:\n keys: columns,\n values: 'reverse' as boolean.\n\n examples:\n Table.sort('A'=False) means sort by 'A' in ascending order.\n Table.sort('A'=True, 'B'=False) means sort 'A' in descending order, then (2nd priority)\n sort B in ascending order.\n \"\"\"\n sub_cls_check(T, BaseTable)\n\n index = sort_index(T, mapping, sort_mode=sort_mode, tqdm=_tqdm, pbar=pbar)\n m = select_processing_method(len(T) * len(T.columns), _sp_reindex, _mp_reindex)\n return m(T, index, tqdm=tqdm, pbar=pbar)\n "},{"location":"reference/sortation/#tablite.sortation.is_sorted","title":"tablite.sortation.is_sorted(T, mapping, sort_mode='excel') ","text":"Performs multi-pass sorting check with precedence given order of column names. PARAMETER DESCRIPTION mapping sort criteria. See Table.sort() RETURNS DESCRIPTION bool Source code in tablite/sortation.py def is_sorted(T, mapping, sort_mode=\"excel\"):\n \"\"\"Performs multi-pass sorting check with precedence given order of column names.\n\n Args:\n mapping: sort criteria. See Table.sort()\n sort_mode = sort mode. See Table.sort()\n\n Returns:\n bool\n \"\"\"\n index = sort_index(T, mapping, sort_mode=sort_mode)\n match = np.arange(len(T))\n return np.all(index == match)\n "},{"location":"reference/tools/","title":"Tools","text":""},{"location":"reference/tools/#tablite.tools","title":"tablite.tools ","text":""},{"location":"reference/tools/#tablite.tools-attributes","title":"Attributes","text":""},{"location":"reference/tools/#tablite.tools.guess","title":"tablite.tools.guess = DataTypes.guess module-attribute ","text":""},{"location":"reference/tools/#tablite.tools.xround","title":"tablite.tools.xround = DataTypes.round module-attribute ","text":""},{"location":"reference/tools/#tablite.tools-classes","title":"Classes","text":""},{"location":"reference/tools/#tablite.tools-functions","title":"Functions","text":""},{"location":"reference/tools/#tablite.tools.head","title":"tablite.tools.head(path, linecount=5, delimiter=None) ","text":"Gets the head of any supported file format. 
Source code in tablite/tools.py def head(path, linecount=5, delimiter=None):\n \"\"\"\n Gets the head of any supported file format.\n \"\"\"\n return get_headers(path, linecount=linecount, delimiter=delimiter)\n "},{"location":"reference/utils/","title":"Utils","text":""},{"location":"reference/utils/#tablite.utils","title":"tablite.utils ","text":""},{"location":"reference/utils/#tablite.utils-attributes","title":"Attributes","text":""},{"location":"reference/utils/#tablite.utils.letters","title":"tablite.utils.letters = string.ascii_lowercase + string.digits module-attribute ","text":""},{"location":"reference/utils/#tablite.utils.NoneType","title":"tablite.utils.NoneType = type(None) module-attribute ","text":""},{"location":"reference/utils/#tablite.utils.required_keys","title":"tablite.utils.required_keys = {'min', 'max', 'mean', 'median', 'stdev', 'mode', 'distinct', 'iqr_low', 'iqr_high', 'iqr', 'sum', 'summary type', 'histogram'} module-attribute ","text":""},{"location":"reference/utils/#tablite.utils.summary_methods","title":"tablite.utils.summary_methods = {bool: _boolean_statistics_summary, int: _numeric_statistics_summary, float: _numeric_statistics_summary, str: _string_statistics_summary, date: _date_statistics_summary, datetime: _datetime_statistics_summary, time: _time_statistics_summary, timedelta: _timedelta_statistics_summary, type(None): _none_type_summary} module-attribute ","text":""},{"location":"reference/utils/#tablite.utils-classes","title":"Classes","text":""},{"location":"reference/utils/#tablite.utils-functions","title":"Functions","text":""},{"location":"reference/utils/#tablite.utils.generate_random_string","title":"tablite.utils.generate_random_string(len) ","text":"Source code in tablite/utils.py def generate_random_string(len):\n return \"\".join(random.choice(letters) for i in range(len))\n "},{"location":"reference/utils/#tablite.utils.type_check","title":"tablite.utils.type_check(var, kind) ","text":"Source code in tablite/utils.py def type_check(var, kind):\n if not isinstance(var, kind):\n raise TypeError(f\"Expected {kind}, not {type(var)}\")\n "},{"location":"reference/utils/#tablite.utils.sub_cls_check","title":"tablite.utils.sub_cls_check(c, kind) ","text":"Source code in tablite/utils.py def sub_cls_check(c, kind):\n if not issubclass(type(c), kind):\n raise TypeError(f\"Expected {kind}, not {type(c)}\")\n "},{"location":"reference/utils/#tablite.utils.name_check","title":"tablite.utils.name_check(options, *names) ","text":"Source code in tablite/utils.py def name_check(options, *names):\n for n in names:\n if n not in options:\n raise ValueError(f\"{n} not in {options}\")\n "},{"location":"reference/utils/#tablite.utils.unique_name","title":"tablite.utils.unique_name(wanted_name, set_of_names) ","text":"returns a wanted_name as wanted_name_i given a list of names which guarantees unique naming. 
Source code in tablite/utils.py def unique_name(wanted_name, set_of_names):\n \"\"\"\n returns a wanted_name as wanted_name_i given a list of names\n which guarantees unique naming.\n \"\"\"\n if not isinstance(set_of_names, set):\n set_of_names = set(set_of_names)\n name, i = wanted_name, 1\n while name in set_of_names:\n name = f\"{wanted_name}_{i}\"\n i += 1\n return name\n "},{"location":"reference/utils/#tablite.utils.expression_interpreter","title":"tablite.utils.expression_interpreter(expression, columns) ","text":"Interprets valid expressions such as: \"all((A==B, C!=4, 200<D))\"\n as def _f(A,B,C,D): return all((A==B, C!=4, 200<D)) using python's compiler. Source code in tablite/utils.py def expression_interpreter(expression, columns):\n \"\"\"\n Interprets valid expressions such as:\n\n \"all((A==B, C!=4, 200<D))\"\n\n as:\n def _f(A,B,C,D):\n return all((A==B, C!=4, 200<D))\n\n using python's compiler.\n \"\"\"\n if not isinstance(expression, str):\n raise TypeError(f\"`{expression}` is not a str\")\n if not isinstance(columns, list):\n raise TypeError\n if not all(isinstance(i, str) for i in columns):\n raise TypeError\n\n req_columns = \", \".join(i for i in columns if i in expression)\n script = f\"def f({req_columns}):\\n return {expression}\"\n tree = ast.parse(script)\n code = compile(tree, filename=\"blah\", mode=\"exec\")\n namespace = {}\n exec(code, namespace)\n f = namespace[\"f\"]\n if not callable(f):\n raise ValueError(f\"The expression could not be parse: {expression}\")\n return f\n "},{"location":"reference/utils/#tablite.utils.intercept","title":"tablite.utils.intercept(A, B) ","text":"Enables calculation of the intercept of two range objects. Used to determine if a datablock contains a slice. PARAMETER DESCRIPTION A range B range RETURNS DESCRIPTION range The intercept of ranges A and B. 
Source code in tablite/utils.py def intercept(A, B):\n \"\"\"Enables calculation of the intercept of two range objects.\n Used to determine if a datablock contains a slice.\n\n Args:\n A: range\n B: range\n\n Returns:\n range: The intercept of ranges A and B.\n \"\"\"\n type_check(A, range)\n type_check(B, range)\n\n if A.step < 1:\n A = range(A.stop + 1, A.start + 1, 1)\n if B.step < 1:\n B = range(B.stop + 1, B.start + 1, 1)\n\n if len(A) == 0:\n return range(0)\n if len(B) == 0:\n return range(0)\n\n if A.stop <= B.start:\n return range(0)\n if A.start >= B.stop:\n return range(0)\n\n if A.start <= B.start:\n if A.stop <= B.stop:\n start, end = B.start, A.stop\n elif A.stop > B.stop:\n start, end = B.start, B.stop\n else:\n raise ValueError(\"bad logic\")\n elif A.start < B.stop:\n if A.stop <= B.stop:\n start, end = A.start, A.stop\n elif A.stop > B.stop:\n start, end = A.start, B.stop\n else:\n raise ValueError(\"bad logic\")\n else:\n raise ValueError(\"bad logic\")\n\n a_steps = math.ceil((start - A.start) / A.step)\n a_start = (a_steps * A.step) + A.start\n\n b_steps = math.ceil((start - B.start) / B.step)\n b_start = (b_steps * B.step) + B.start\n\n if A.step == 1 or B.step == 1:\n start = max(a_start, b_start)\n step = max(A.step, B.step)\n return range(start, end, step)\n elif A.step == B.step:\n a, b = min(A.start, B.start), max(A.start, B.start)\n if (b - a) % A.step != 0: # then the ranges are offset.\n return range(0)\n else:\n return range(b, end, step)\n else:\n # determine common step size:\n step = max(A.step, B.step) if math.gcd(A.step, B.step) != 1 else A.step * B.step\n # examples:\n # 119 <-- 17 if 1 != 1 else 119 <-- max(7, 17) if math.gcd(7, 17) != 1 else 7 * 17\n # 30 <-- 30 if 3 != 1 else 90 <-- max(3, 30) if math.gcd(3, 30) != 1 else 3*30\n if A.step < B.step:\n for n in range(a_start, end, A.step): # increment in smallest step to identify the first common value.\n if n < b_start:\n continue\n elif (n - b_start) % B.step == 0:\n return range(n, end, step) # common value found.\n else:\n for n in range(b_start, end, B.step):\n if n < a_start:\n continue\n elif (n - a_start) % A.step == 0:\n return range(n, end, step)\n\n return range(0)\n "},{"location":"reference/utils/#tablite.utils.summary_statistics","title":"tablite.utils.summary_statistics(values, counts) ","text":"values: any type counts: integer returns dict with: - min (int/float, length of str, date) - max (int/float, length of str, date) - mean (int/float, length of str, date) - median (int/float, length of str, date) - stdev (int/float, length of str, date) - mode (int/float, length of str, date) - distinct (number of distinct values) - iqr (int/float, length of str, date) - sum (int/float, length of str, date) - histogram (2 arrays: values, count of each values) Source code in tablite/utils.py def summary_statistics(values, counts):\n \"\"\"\n values: any type\n counts: integer\n\n returns dict with:\n - min (int/float, length of str, date)\n - max (int/float, length of str, date)\n - mean (int/float, length of str, date)\n - median (int/float, length of str, date)\n - stdev (int/float, length of str, date)\n - mode (int/float, length of str, date)\n - distinct (number of distinct values)\n - iqr (int/float, length of str, date)\n - sum (int/float, length of str, date)\n - histogram (2 arrays: values, count of each values)\n \"\"\"\n # determine the dominant datatype:\n dtypes = defaultdict(int)\n most_frequent, most_frequent_dtype = 0, int\n for v, c in zip(values, counts):\n dtype = type(v)\n total 
= dtypes[dtype] + c\n dtypes[dtype] = total\n if total > most_frequent:\n most_frequent_dtype = dtype\n most_frequent = total\n\n if most_frequent == 0:\n return {}\n\n most_frequent_dtype = max(dtypes, key=dtypes.get)\n mask = [type(v) == most_frequent_dtype for v in values]\n v = list(compress(values, mask))\n c = list(compress(counts, mask))\n\n f = summary_methods.get(most_frequent_dtype, int)\n result = f(v, c)\n result[\"distinct\"] = len(values)\n result[\"summary type\"] = most_frequent_dtype.__name__\n result[\"histogram\"] = [values, counts]\n assert set(result.keys()) == required_keys, \"Key missing!\"\n return result\n "},{"location":"reference/utils/#tablite.utils.date_range","title":"tablite.utils.date_range(start, stop, step) ","text":"Source code in tablite/utils.py def date_range(start, stop, step):\n if not isinstance(start, datetime):\n raise TypeError(\"start is not datetime\")\n if not isinstance(stop, datetime):\n raise TypeError(\"stop is not datetime\")\n if not isinstance(step, timedelta):\n raise TypeError(\"step is not timedelta\")\n n = (stop - start) // step\n return [start + step * i for i in range(n)]\n "},{"location":"reference/utils/#tablite.utils.dict_to_rows","title":"tablite.utils.dict_to_rows(d) ","text":"Source code in tablite/utils.py def dict_to_rows(d):\n type_check(d, dict)\n rows = []\n max_length = max(len(i) for i in d.values())\n order = list(d.keys())\n rows.append(order)\n for i in range(max_length):\n row = [d[k][i] for k in order]\n rows.append(row)\n return rows\n "},{"location":"reference/utils/#tablite.utils.calc_col_count","title":"tablite.utils.calc_col_count(letters: str) ","text":"Source code in tablite/utils.py def calc_col_count(letters: str):\n ord_nil = ord(\"A\") - 1\n cols_per_letter = ord(\"Z\") - ord_nil\n col_count = 0\n\n for i, v in enumerate(reversed(letters)):\n col_count = col_count + (ord(v) - ord_nil) * pow(cols_per_letter, i)\n\n return col_count\n "},{"location":"reference/utils/#tablite.utils.calc_true_dims","title":"tablite.utils.calc_true_dims(sheet) ","text":"Source code in tablite/utils.py def calc_true_dims(sheet):\n src = sheet._get_source()\n max_col, max_row = 0, 0\n\n regex = re.compile(\"\\d+\")\n\n def handleStartElement(name, attrs):\n nonlocal max_col, max_row\n\n if name == \"c\":\n last_index = attrs[\"r\"]\n idx, _ = next(regex.finditer(last_index)).span()\n letters, digits = last_index[0:idx], int(last_index[idx:])\n\n col_idx, row_idx = calc_col_count(letters), digits\n\n max_col, max_row = max(max_col, col_idx), max(max_row, row_idx)\n\n parser = expat.ParserCreate()\n parser.buffer_text = True\n parser.StartElementHandler = handleStartElement\n parser.ParseFile(src)\n\n return max_col, max_row\n "},{"location":"reference/utils/#tablite.utils.fixup_worksheet","title":"tablite.utils.fixup_worksheet(worksheet) ","text":"Source code in tablite/utils.py def fixup_worksheet(worksheet):\n try:\n ws_cols, ws_rows = calc_true_dims(worksheet)\n\n worksheet._max_column = ws_cols\n worksheet._max_row = ws_rows\n except Exception as e:\n logging.error(f\"Failed to fetch true dimensions: {e}\")\n "},{"location":"reference/utils/#tablite.utils.update_access_time","title":"tablite.utils.update_access_time(path) ","text":"Source code in tablite/utils.py def update_access_time(path):\n path = Path(path)\n stat = path.stat()\n os.utime(path, (now(), stat.st_mtime))\n "},{"location":"reference/utils/#tablite.utils.load_numpy","title":"tablite.utils.load_numpy(path) ","text":"Source code in tablite/utils.py def 
load_numpy(path):\n update_access_time(path)\n\n return np.load(path, allow_pickle=True, fix_imports=False)\n "},{"location":"reference/utils/#tablite.utils.select_type_name","title":"tablite.utils.select_type_name(dtypes: dict) ","text":"Source code in tablite/utils.py def select_type_name(dtypes: dict):\n dtypes = [t for t in dtypes.items() if t[0] != NoneType]\n\n if len(dtypes) == 0:\n return \"empty\"\n\n (best_type, _), *_ = sorted(dtypes, key=lambda t: t[1], reverse=True)\n\n return best_type.__name__\n "},{"location":"reference/utils/#tablite.utils.get_predominant_types","title":"tablite.utils.get_predominant_types(table, all_dtypes=None) ","text":"Source code in tablite/utils.py def get_predominant_types(table, all_dtypes=None):\n if all_dtypes is None:\n all_dtypes = table.types()\n\n dtypes = {\n k: select_type_name(v)\n for k, v in all_dtypes.items()\n }\n\n return dtypes\n "},{"location":"reference/utils/#tablite.utils.py_to_nim_encoding","title":"tablite.utils.py_to_nim_encoding(encoding: str) -> str ","text":"Source code in tablite/utils.py def py_to_nim_encoding(encoding: str) -> str:\n if encoding is None or encoding.lower() in [\"ascii\", \"utf8\", \"utf-8\", \"utf-8-sig\"]:\n return \"ENC_UTF8\"\n elif encoding.lower() in [\"utf16\", \"utf-16\"]:\n return \"ENC_UTF16\"\n elif encoding in Config.NIM_SUPPORTED_CONV_TYPES:\n return f\"ENC_CONV|{encoding}\"\n\n raise NotImplementedError(f\"encoding not implemented: {encoding}\")\n "},{"location":"reference/version/","title":"Version","text":""},{"location":"reference/version/#tablite.version","title":"tablite.version ","text":""},{"location":"reference/version/#tablite.version-attributes","title":"Attributes","text":""},{"location":"reference/version/#tablite.version.__version_info__","title":"tablite.version.__version_info__ = (major, minor, patch) module-attribute ","text":""},{"location":"reference/version/#tablite.version.__version__","title":"tablite.version.__version__ = '.'.join(str(i) for i in __version_info__) module-attribute ","text":""}]}
\ No newline at end of file
+{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"Tablite","text":""},{"location":"#contents","title":"Contents","text":" - introduction
- installation
- feature overview
- api
- tutorial
- latest updates
- credits
"},{"location":"#introduction","title":"Introduction","text":"Tablite seeks to be the go-to library for manipulating tabular data with an api that is as close in syntax to pure python as possible. "},{"location":"#even-smaller-memory-footprint","title":"Even smaller memory footprint","text":"Tablite uses numpys fileformat as a backend with strong abstraction, so that copy, append & repetition of data is handled in pages. This is imperative for incremental data processing. Tablite tests for memory footprint. One test compares the memory footprint of 10,000,000 integers where tablite will use < 1 Mb RAM in contrast to python which will require around 133.7 Mb of RAM (1M lists with 10 integers). Tablite also tests to assure that working with 1Tb of data is tolerable. Tablite achieves this minimal memory footprint by using a temporary storage set in config.Config.workdir as tempfile.gettempdir()/tablite-tmp . If your OS (windows/linux/mac) sits on a SSD this will benefit from high IOPS and permit slices of 9,000,000,000 rows in less than a second. "},{"location":"#multiprocessing-enabled-by-default","title":"Multiprocessing enabled by default","text":"Tablite uses numpy whereever possible and applies multiprocessing for bypassing the GIL on all major operations. CSV import is performed in C through using nim s compiler and is as fast the hardware allows. "},{"location":"#all-algorithms-have-been-reworked-to-respect-memory-limits","title":"All algorithms have been reworked to respect memory limits","text":"Tablite respects the limits of free memory by tagging the free memory and defining task size before each memory intensive task is initiated (join, groupby, data import, etc). If you still run out of memory you may try to reduce the config.Config.PAGE_SIZE and rerun your program. "},{"location":"#100-support-for-all-python-datatypes","title":"100% support for all python datatypes","text":"Tablite wants to make it easy for you to work with data. tablite.Table's behave like a dict with lists: my_table[column name] = [... data ...] . Tablite uses datatype mapping to native numpy types where possible and uses type mapping for non-native types such as timedelta, None, date, time\u2026 e.g. what you put in, is what you get out. This is inspired by bank python. "},{"location":"#light-weight","title":"Light weight","text":"Tablite is ~200 kB. "},{"location":"#helpful","title":"Helpful","text":"Tablite wants you to be productive, so a number of helpers are available. Table.import_file to import csv*, tsv, txt, xls, xlsx, xlsm, ods, zip and logs. There is automatic type detection (see tutorial.ipynb ) - To peek into any supported file use
get_headers which shows the first 10 rows. - Use
mytable.rows and mytable.columns to iterate over rows or columns. - Create multi-key
.index for quick lookups. - Perform multi-key
.sort . - Filter using
.any and .all to select specific rows. - Use multi-key
.lookup and .join to find data across tables. - Perform
.groupby and reorganise data as a .pivot table with max, min, sum, first, last, count, unique, average, st.deviation, median and mode - Append / concatenate tables with
+= which automatically sorts out the columns - even if they're not in perfect order. - Should your tables be similar but not identical, you can use
.stack to \"stack\" tables on top of each other If you're still missing something add it to the wishlist "},{"location":"#installation","title":"Installation","text":"Get it from pypi: Install: pip install tablite Usage: >>> from tablite import Table "},{"location":"#build-test","title":"Build & test","text":"install nim >= 2.0.0 run: chmod +x ./build_nim.sh run: ./build_nim.sh Should the default nim not be your desired taste, please use nims environment manager (atlas ) and run source nim-2.0.0/activate.sh on UNIX or nim-2.0.0/activate.bat on windows. install python >= 3.8\npython -m venv /your/venv/dir\nactivate /your/venv/dir\npip install -r requirements.txt\npip install -r requirements_for_testing.py\npytest ./tests\n "},{"location":"#feature-overview","title":"Feature overview","text":"want to... this way... loop over rows [ row for row in table.rows ] loop over columns [ table[col_name] for col_name in table.columns ] slice myslice = table['A', 'B', slice(0,None,15)] get column by name my_table['A'] get row by index my_table[9_000_000_001] value update mytable['A'][2] = new value update w. list comprehension mytable['A'] = [ x*x for x in mytable['A'] if x % 2 != 0 ] join a_join = numbers.join(letters, left_keys=['colour'], right_keys=['color'], left_columns=['number'], right_columns=['letter'], kind='left') lookup travel_plan = friends.lookup(bustable, (DataTypes.time(21, 10), \"<=\", 'time'), ('stop', \"==\", 'stop')) groupby group_by = table.groupby(keys=['C', 'B'], functions=[('A', gb.count)]) pivot table my_pivot = t.pivot(rows=['C'], columns=['A'], functions=[('B', gb.sum), ('B', gb.count)], values_as_rows=False) index indices = old_table.index(*old_table.columns) sort lookup1_sorted = lookup_1.sort(**{'time': True, 'name':False, \"sort_mode\":'unix'}) filter true, false = unfiltered.filter( [{\"column1\": 'a', \"criteria\":\">=\", 'value2':3}, ... more criteria ... ], filter_type='all' ) find any any_even_rows = mytable.any('A': lambda x : x%2==0, 'B': lambda x > 0) find all all_even_rows = mytable.all('A': lambda x : x%2==0, 'B': lambda x > 0) to json json_str = my_table.to_json() from json Table.from_json(json_str) "},{"location":"#api","title":"API","text":"To view the detailed API see api "},{"location":"#tutorial","title":"Tutorial","text":"To learn more see the tutorial.ipynb (Jupyter notebook) "},{"location":"#latest-updates","title":"Latest updates","text":"See changelog.md "},{"location":"#credits","title":"Credits","text":" - Eugene Antonov - the api documentation.
- Audrius Kulikajevas - Edge case testing / various bugs, Jupyter notebook integration.
- Ovidijus Grigas - various bugs, documentation.
- Martynas Kaunas - GroupBy functionality.
- Sergej Sinkarenko - various bugs.
- Lori Cooper - spell checking.
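To round off this page, here is a minimal end-to-end sketch assembled from the one-liners in the feature overview above. The table contents are invented for illustration, and the keyword forms (criteria dicts for filter, column=callable pairs for any) follow the overview table; treat it as a sketch rather than canonical API documentation.

```python
from tablite import Table

# a table behaves like a dict with lists
t = Table({'colour': ['red', 'blue', 'red'], 'number': [1, 2, 3]})

# filter with criteria dicts; returns the matching and non-matching tables
true, false = t.filter(
    [{"column1": 'number', "criteria": ">=", "value2": 2}],
    filter_type='all',
)

# any/all take column=callable pairs (kwargs form, as used in the benchmarks below)
evens = t.any(**{'number': lambda x: x % 2 == 0})

# append with += and repeat with *=
t2 = t + t
t2 *= 3
```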
"},{"location":"benchmarks/","title":"Benchmarks","text":"In\u00a0[2]: Copied! import psutil, os, gc, shutil, tempfile\nfrom pathlib import Path\nfrom time import perf_counter, time\nfrom tablite import Table\nfrom tablite.datasets import synthetic_order_data\nfrom tablite.config import Config\n\nConfig.TQDM_DISABLE = True\n import psutil, os, gc, shutil, tempfile from pathlib import Path from time import perf_counter, time from tablite import Table from tablite.datasets import synthetic_order_data from tablite.config import Config Config.TQDM_DISABLE = True In\u00a0[3]: Copied! process = psutil.Process(os.getpid())\n\ndef make_tables(sizes=[1,2,5,10,20,50]):\n # The last tables are too big for RAM (~24Gb), so I create subtables of 1M rows and append them.\n t = synthetic_order_data(Config.PAGE_SIZE)\n real, flat = t.nbytes()\n print(f\"Table {len(t):,} rows is {real/1e6:,.0f} Mb on disk\")\n\n tables = [t] # 1M rows.\n\n last = 1\n t2 = t.copy()\n for i in sizes[1:]:\n t2 = t2.copy()\n for _ in range(i-last):\n t2 += synthetic_order_data(Config.PAGE_SIZE) # these are all unique\n last = i\n real, flat = t2.nbytes()\n tables.append(t2)\n print(f\"Table {len(t2):,} rows is {real/1e6:,.0f} Mb on disk\")\n return tables\n\ntables = make_tables()\n process = psutil.Process(os.getpid()) def make_tables(sizes=[1,2,5,10,20,50]): # The last tables are too big for RAM (~24Gb), so I create subtables of 1M rows and append them. t = synthetic_order_data(Config.PAGE_SIZE) real, flat = t.nbytes() print(f\"Table {len(t):,} rows is {real/1e6:,.0f} Mb on disk\") tables = [t] # 1M rows. last = 1 t2 = t.copy() for i in sizes[1:]: t2 = t2.copy() for _ in range(i-last): t2 += synthetic_order_data(Config.PAGE_SIZE) # these are all unique last = i real, flat = t2.nbytes() tables.append(t2) print(f\"Table {len(t2):,} rows is {real/1e6:,.0f} Mb on disk\") return tables tables = make_tables() Table 1,000,000 rows is 256 Mb on disk\nTable 2,000,000 rows is 512 Mb on disk\nTable 5,000,000 rows is 1,280 Mb on disk\nTable 10,000,000 rows is 2,560 Mb on disk\nTable 20,000,000 rows is 5,120 Mb on disk\nTable 50,000,000 rows is 12,800 Mb on disk\n The values in the tables above are all unique! In\u00a0[4]: Copied! 
tables[-1]\n tables[-1] Out[4]: ~#1234567891011 0114014953182952021-10-06T00:00:0050814119375C3-4HGQ21\u00b0XYZ1.244647268201734421.367107051830455 129320231372182021-08-26T00:00:005007718568C5-5FZU0\u00b00.55294485347516132.6980406874392537 2312569602250812021-12-21T00:00:0050197029074C2-3GTK6\u00b0XYZ1.99739754559065617.513164305723787 3414012777817432021-08-23T00:00:0050818024969C4-3BYP6\u00b0XYZ0.047497125538289577.388171617130485 459426667674262021-07-31T00:00:0050307113074C5-2CCC21\u00b0ABC1.0219215027612885.21324123446987 5612186131851272021-12-01T00:00:0050484117249C5-4WGT21\u00b00.2038764258434556712.190974436133764 676070424343982021-11-29T00:00:0050578011564C2-3LUL0\u00b0XYZ2.2367835158480444.340628097363572.......................................49,999,9939999946602693775472021-09-17T00:00:005015409706C4-3AHQ21\u00b0XYZ0.083216645843125856.56780297752790549,999,9949999955709798646952021-08-01T00:00:0050149125006C1-2FWH6\u00b01.04763923662266419.50710544462706549,999,9959999963551956078252021-07-29T00:00:0050007026992C4-3GVG21\u00b02.20440816560941411.2706443974284949,999,99699999720762240577282021-10-16T00:00:0050950113339C5-4NKS0\u00b02.1593110498135494.21575620046596149,999,9979999986577247891352021-12-21T00:00:0050069114747C2-4LYGNone1.64809640191698683.094420483625827349,999,9989999999775312438842021-12-02T00:00:0050644129345C2-5DRH6\u00b02.30911421692753110.82706867207146849,999,999100000012290713920652021-08-23T00:00:0050706119732C4-5AGB6\u00b00.488871405593691630.8580085696389939 In\u00a0[5]: Copied! def save_load_benchmarks(tables):\n tmp = Path(tempfile.gettempdir()) / \"junk\"\n tmp.mkdir(exist_ok=True)\n\n results = Table()\n results.add_columns('rows', 'save (sec)', 'load (sec)')\n for t in tables:\n fn = tmp / f'{len(t)}.tpz'\n start = perf_counter()\n t.save(fn)\n end = perf_counter()\n save = round(end-start,3)\n assert fn.exists()\n \n \n start = perf_counter()\n t2 = Table.load(fn)\n end = perf_counter()\n load = round(end-start,3)\n print(f\"saving {len(t):,} rows ({fn.stat().st_size/1e6:,.0f} Mb) took {save:,.3f} seconds. loading took {load:,.3f} seconds\")\n del t2\n fn.unlink()\n results.add_rows(len(t), save, load)\n \n r = results\n r['save r/sec'] = [int(a/b) if b!=0 else \"nil\" for a,b in zip(r['rows'], r['save (sec)']) ]\n r['load r/sec'] = [int(a/b) if b!=0 else \"nil\" for a,b in zip(r['rows'], r['load (sec)'])]\n\n return results\n def save_load_benchmarks(tables): tmp = Path(tempfile.gettempdir()) / \"junk\" tmp.mkdir(exist_ok=True) results = Table() results.add_columns('rows', 'save (sec)', 'load (sec)') for t in tables: fn = tmp / f'{len(t)}.tpz' start = perf_counter() t.save(fn) end = perf_counter() save = round(end-start,3) assert fn.exists() start = perf_counter() t2 = Table.load(fn) end = perf_counter() load = round(end-start,3) print(f\"saving {len(t):,} rows ({fn.stat().st_size/1e6:,.0f} Mb) took {save:,.3f} seconds. loading took {load:,.3f} seconds\") del t2 fn.unlink() results.add_rows(len(t), save, load) r = results r['save r/sec'] = [int(a/b) if b!=0 else \"nil\" for a,b in zip(r['rows'], r['save (sec)']) ] r['load r/sec'] = [int(a/b) if b!=0 else \"nil\" for a,b in zip(r['rows'], r['load (sec)'])] return results In\u00a0[6]: Copied! slb = save_load_benchmarks(tables)\n slb = save_load_benchmarks(tables) saving 1,000,000 rows (49 Mb) took 2.148 seconds. loading took 0.922 seconds\nsaving 2,000,000 rows (98 Mb) took 4.267 seconds. loading took 1.820 seconds\nsaving 5,000,000 rows (246 Mb) took 10.618 seconds. 
loading took 4.482 seconds\nsaving 10,000,000 rows (492 Mb) took 21.291 seconds. loading took 8.944 seconds\nsaving 20,000,000 rows (984 Mb) took 42.603 seconds. loading took 17.821 seconds\nsaving 50,000,000 rows (2,461 Mb) took 106.644 seconds. loading took 44.600 seconds\n In\u00a0[7]: Copied! slb\n slb Out[7]: #rowssave (sec)load (sec)save r/secload r/sec 010000002.1480.9224655491084598 120000004.2671.824687131098901 2500000010.6184.4824708981115573 31000000021.2918.9444696821118067 42000000042.60317.8214694501122271 550000000106.64444.64688491121076 With various compression options In\u00a0[8]: Copied! def save_compression_benchmarks(t):\n tmp = Path(tempfile.gettempdir()) / \"junk\"\n tmp.mkdir(exist_ok=True)\n\n import zipfile # https://docs.python.org/3/library/zipfile.html#zipfile.ZipFile\n methods = [(None, zipfile.ZIP_STORED, \"zip stored\"), (None, zipfile.ZIP_LZMA, \"zip lzma\")]\n methods += [(i, zipfile.ZIP_DEFLATED, \"zip deflated\") for i in range(0,10)]\n methods += [(i, zipfile.ZIP_BZIP2, \"zip bzip2\") for i in range(1,10)]\n\n results = Table()\n results.add_columns('file size (Mb)', 'method', 'write (sec)', 'read (sec)')\n for level, method, name in methods:\n fn = tmp / f'{len(t)}.tpz'\n start = perf_counter() \n t.save(fn, compression_method=method, compression_level=level)\n end = perf_counter()\n write = round(end-start,3)\n assert fn.exists()\n size = int(fn.stat().st_size/1e6)\n # print(f\"{name}(level={level}): {len(t):,} rows ({size} Mb) took {write:,.3f} secconds to save\", end='')\n \n start = perf_counter()\n t2 = Table.load(fn)\n end = perf_counter()\n read = round(end-start,3)\n # print(f\" and {end-start:,.3} seconds to load\")\n print(\".\", end='')\n \n del t2\n fn.unlink()\n results.add_rows(size, f\"{name}(level={level})\", write, read)\n \n \n r = results\n r.sort({'write (sec)':True})\n r['write (rps)'] = [int(1_000_000/b) for b in r['write (sec)']]\n r['read (rps)'] = [int(1_000_000/b) for b in r['read (sec)']]\n return results\n def save_compression_benchmarks(t): tmp = Path(tempfile.gettempdir()) / \"junk\" tmp.mkdir(exist_ok=True) import zipfile # https://docs.python.org/3/library/zipfile.html#zipfile.ZipFile methods = [(None, zipfile.ZIP_STORED, \"zip stored\"), (None, zipfile.ZIP_LZMA, \"zip lzma\")] methods += [(i, zipfile.ZIP_DEFLATED, \"zip deflated\") for i in range(0,10)] methods += [(i, zipfile.ZIP_BZIP2, \"zip bzip2\") for i in range(1,10)] results = Table() results.add_columns('file size (Mb)', 'method', 'write (sec)', 'read (sec)') for level, method, name in methods: fn = tmp / f'{len(t)}.tpz' start = perf_counter() t.save(fn, compression_method=method, compression_level=level) end = perf_counter() write = round(end-start,3) assert fn.exists() size = int(fn.stat().st_size/1e6) # print(f\"{name}(level={level}): {len(t):,} rows ({size} Mb) took {write:,.3f} secconds to save\", end='') start = perf_counter() t2 = Table.load(fn) end = perf_counter() read = round(end-start,3) # print(f\" and {end-start:,.3} seconds to load\") print(\".\", end='') del t2 fn.unlink() results.add_rows(size, f\"{name}(level={level})\", write, read) r = results r.sort({'write (sec)':True}) r['write (rps)'] = [int(1_000_000/b) for b in r['write (sec)']] r['read (rps)'] = [int(1_000_000/b) for b in r['read (sec)']] return results In\u00a0[9]: Copied! scb = save_compression_benchmarks(tables[0])\n scb = save_compression_benchmarks(tables[0]) ..................... 
creating sort index: 0%| | 0/1 [00:00<?, ?it/s]\rcreating sort index: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 1/1 [00:00<00:00, 268.92it/s]\n In\u00a0[10]: Copied! scb[0:20]\n scb[0:20] Out[10]: #file size (Mb)methodwrite (sec)read (sec)write (rps)read (rps) 0256zip stored(level=None)0.3960.47525252522105263 129zip lzma(level=None)95.1372.22810511448833 2256zip deflated(level=0)0.5350.59518691581680672 349zip deflated(level=1)2.150.9224651161084598 447zip deflated(level=2)2.2640.9124416961096491 543zip deflated(level=3)3.0490.833279761204819 644zip deflated(level=4)2.920.8623424651160092 742zip deflated(level=5)4.0340.8692478921150747 840zip deflated(level=6)8.5580.81168491250000 939zip deflated(level=7)13.6950.7787301912853471038zip deflated(level=8)56.9720.7921755212626261138zip deflated(level=9)122.6230.791815512642221229zip bzip2(level=1)15.1214.065661332460021329zip bzip2(level=2)16.0474.214623162373041429zip bzip2(level=3)16.8584.409593192268081529zip bzip2(level=4)17.6485.141566631945141629zip bzip2(level=5)18.6746.009535501664171729zip bzip2(level=6)19.4056.628515331508751829zip bzip2(level=7)19.9546.714501151489421929zip bzip2(level=8)20.5956.96148555143657 Conclusions - Fastest: zip stored with no compression handles ~2.5M rows/sec on write and ~2.1M rows/sec on read, but offers no size reduction (256 Mb). A short sketch of these save options follows.
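A minimal sketch of these save options, using the same compression_method / compression_level keywords as the benchmark code above; the file names are illustrative.

```python
import zipfile
from tablite import Table

t = Table({'A': list(range(1_000_000))})

# fastest write/read, but no size reduction (zip stored)
t.save('fast.tpz', compression_method=zipfile.ZIP_STORED, compression_level=None)

# ~5x smaller file at a modest time cost (deflate, level 1), per the table above
t.save('small.tpz', compression_method=zipfile.ZIP_DEFLATED, compression_level=1)
```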
In\u00a0[11]: Copied! def to_sql_benchmark(t, rows=1_000_000):\n t2 = t[:rows]\n write_start = time()\n _ = t2.to_sql(name='1')\n write_end = time()\n write = round(write_end-write_start,3)\n return ( t.to_sql.__name__, write, 0, len(t2), \"\" , \"\" )\n def to_sql_benchmark(t, rows=1_000_000): t2 = t[:rows] write_start = time() _ = t2.to_sql(name='1') write_end = time() write = round(write_end-write_start,3) return ( t.to_sql.__name__, write, 0, len(t2), \"\" , \"\" ) In\u00a0[12]: Copied! def to_json_benchmark(t, rows=1_000_000):\n t2 = t[:rows]\n\n tmp = Path(tempfile.gettempdir()) / \"junk\"\n tmp.mkdir(exist_ok=True)\n path = tmp / \"1.json\" \n \n write_start = time()\n bytestr = t2.to_json()\n with path.open('w') as fo:\n fo.write(bytestr)\n write_end = time()\n write = round(write_end-write_start,3)\n\n read_start = time()\n with path.open('r') as fi:\n _ = Table.from_json(fi.read()) # <-- JSON\n read_end = time()\n read = round(read_end-read_start,3)\n\n return ( t.to_json.__name__, write, read, len(t2), int(path.stat().st_size/1e6), \"\" )\n def to_json_benchmark(t, rows=1_000_000): t2 = t[:rows] tmp = Path(tempfile.gettempdir()) / \"junk\" tmp.mkdir(exist_ok=True) path = tmp / \"1.json\" write_start = time() bytestr = t2.to_json() with path.open('w') as fo: fo.write(bytestr) write_end = time() write = round(write_end-write_start,3) read_start = time() with path.open('r') as fi: _ = Table.from_json(fi.read()) # <-- JSON read_end = time() read = round(read_end-read_start,3) return ( t.to_json.__name__, write, read, len(t2), int(path.stat().st_size/1e6), \"\" ) In\u00a0[13]: Copied! def f(t, args):\n rows, c1, c1_kw, c2, c2_kw = args\n t2 = t[:rows]\n\n call = getattr(t2, c1)\n assert callable(call)\n\n write_start = time()\n call(**c1_kw)\n write_end = time()\n write = round(write_end-write_start,3)\n\n for _ in range(10):\n gc.collect()\n\n read_start = time()\n if callable(c2):\n c2(**c2_kw)\n read_end = time()\n read = round(read_end-read_start,3)\n\n fn = c2_kw['path']\n assert fn.exists()\n fs = int(fn.stat().st_size/1e6)\n config = {k:v for k,v in c2_kw.items() if k!= 'path'}\n\n return ( c1, write, read, len(t2), fs , str(config))\n def f(t, args): rows, c1, c1_kw, c2, c2_kw = args t2 = t[:rows] call = getattr(t2, c1) assert callable(call) write_start = time() call(**c1_kw) write_end = time() write = round(write_end-write_start,3) for _ in range(10): gc.collect() read_start = time() if callable(c2): c2(**c2_kw) read_end = time() read = round(read_end-read_start,3) fn = c2_kw['path'] assert fn.exists() fs = int(fn.stat().st_size/1e6) config = {k:v for k,v in c2_kw.items() if k!= 'path'} return ( c1, write, read, len(t2), fs , str(config)) In\u00a0[14]: Copied! 
def import_export_benchmarks(tables):\n Config.PROCESSING_MODE = Config.FALSE\n \n t = sorted(tables, key=lambda x: len(x), reverse=True)[0]\n \n tmp = Path(tempfile.gettempdir()) / \"junk\"\n tmp.mkdir(exist_ok=True) \n\n args = [\n ( 100_000, \"to_xlsx\", {'path': tmp/'1.xlsx'}, Table.from_file, {\"path\":tmp/'1.xlsx', \"sheet\":\"pyexcel_sheet1\"}),\n ( 50_000, \"to_ods\", {'path': tmp/'1.ods'}, Table.from_file, {\"path\":tmp/'1.ods', \"sheet\":\"pyexcel_sheet1\"} ), # 50k rows, otherwise MemoryError.\n ( 1_000_000, \"to_csv\", {'path': tmp/'1.csv'}, Table.from_file, {\"path\":tmp/'1.csv'} ),\n ( 1_000_000, \"to_csv\", {'path': tmp/'1.csv'}, Table.from_file, {\"path\":tmp/'1.csv', \"guess_datatypes\":False}),\n (10_000_000, \"to_csv\", {'path': tmp/'1.csv'}, Table.from_file, {\"path\":tmp/'1.csv', \"guess_datatypes\":False}),\n ( 1_000_000, \"to_tsv\", {'path': tmp/'1.tsv'}, Table.from_file, {\"path\":tmp/'1.tsv'} ),\n ( 1_000_000, \"to_text\", {'path': tmp/'1.txt'}, Table.from_file, {\"path\":tmp/'1.txt'} ),\n ( 1_000_000, \"to_html\", {'path': tmp/'1.html'}, Table.from_file, {\"path\":tmp/'1.html'} ),\n ( 1_000_000, \"to_hdf5\", {'path': tmp/'1.hdf5'}, Table.from_file, {\"path\":tmp/'1.hdf5'} )\n ]\n\n results = Table()\n results.add_columns('method', 'write (s)', 'read (s)', 'rows', 'size (Mb)', 'config')\n\n results.add_rows( to_sql_benchmark(t) )\n results.add_rows( to_json_benchmark(t) )\n\n for arg in args:\n if len(t)<arg[0]:\n continue\n print(\".\", end='')\n try:\n results.add_rows( f(t, arg) )\n except MemoryError:\n results.add_rows( arg[1], \"Memory Error\", \"NIL\", args[0], \"NIL\", \"N/A\")\n \n r = results\n r['read r/sec'] = [int(a/b) if b!=0 else \"nil\" for a,b in zip(r['rows'], r['read (s)']) ]\n r['write r/sec'] = [int(a/b) if b!=0 else \"nil\" for a,b in zip(r['rows'], r['write (s)'])]\n\n shutil.rmtree(tmp)\n return results\n def import_export_benchmarks(tables): Config.PROCESSING_MODE = Config.FALSE t = sorted(tables, key=lambda x: len(x), reverse=True)[0] tmp = Path(tempfile.gettempdir()) / \"junk\" tmp.mkdir(exist_ok=True) args = [ ( 100_000, \"to_xlsx\", {'path': tmp/'1.xlsx'}, Table.from_file, {\"path\":tmp/'1.xlsx', \"sheet\":\"pyexcel_sheet1\"}), ( 50_000, \"to_ods\", {'path': tmp/'1.ods'}, Table.from_file, {\"path\":tmp/'1.ods', \"sheet\":\"pyexcel_sheet1\"} ), # 50k rows, otherwise MemoryError. ( 1_000_000, \"to_csv\", {'path': tmp/'1.csv'}, Table.from_file, {\"path\":tmp/'1.csv'} ), ( 1_000_000, \"to_csv\", {'path': tmp/'1.csv'}, Table.from_file, {\"path\":tmp/'1.csv', \"guess_datatypes\":False}), (10_000_000, \"to_csv\", {'path': tmp/'1.csv'}, Table.from_file, {\"path\":tmp/'1.csv', \"guess_datatypes\":False}), ( 1_000_000, \"to_tsv\", {'path': tmp/'1.tsv'}, Table.from_file, {\"path\":tmp/'1.tsv'} ), ( 1_000_000, \"to_text\", {'path': tmp/'1.txt'}, Table.from_file, {\"path\":tmp/'1.txt'} ), ( 1_000_000, \"to_html\", {'path': tmp/'1.html'}, Table.from_file, {\"path\":tmp/'1.html'} ), ( 1_000_000, \"to_hdf5\", {'path': tmp/'1.hdf5'}, Table.from_file, {\"path\":tmp/'1.hdf5'} ) ] results = Table() results.add_columns('method', 'write (s)', 'read (s)', 'rows', 'size (Mb)', 'config') results.add_rows( to_sql_benchmark(t) ) results.add_rows( to_json_benchmark(t) ) for arg in args: if len(t) In\u00a0[15]: Copied! ieb = import_export_benchmarks(tables)\n ieb = import_export_benchmarks(tables) .........writing 12,000,000 records to /tmp/junk/1.hdf5... done\n In\u00a0[16]: Copied! 
ieb\n ieb Out[16]: #methodwrite (s)read (s)rowssize (Mb)configread r/secwrite r/sec 0to_sql12.34501000000nil81004 1to_json10.8144.406100000014222696392472 2to_xlsx10.56921.5721000009{'sheet': 'pyexcel_sheet1'}46359461 3to_ods29.17529.487500003{'sheet': 'pyexcel_sheet1'}16951713 4to_csv14.31515.7311000000108{}6356869856 5to_csv14.4388.1691000000108{'guess_datatypes': False}12241469261 6to_csv140.64599.45100000001080{'guess_datatypes': False}10055371100 7to_tsv13.83415.7631000000108{}6343972285 8to_text13.93715.6821000000108{}6376771751 9to_html12.5780.531000000228{}18867927950310to_hdf55.0112.3451000000316{}81004199600 Conclusions Best: - to/from JSON wins with ~0.23M rps read
- to/from CSV/TSV/TEXT comes 2nd with config
guess_datatypes=False at ~122k rps Worst: - to/from ods ran out of memory and hence had to be reduced to 50k rows. It also had the slowest read rate, at ~1,700 rps. A short sketch of the fast path follows.
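For the fast path, a minimal sketch using the same keywords as the benchmark args above; the path is illustrative.

```python
from pathlib import Path
from tablite import Table

path = Path('/tmp/example.csv')  # illustrative location
t = Table({'A': list(range(1_000_000))})
t.to_csv(path=path)

# skipping type guessing roughly doubles csv read throughput (see the table above)
fast = Table.from_file(path, guess_datatypes=False)
slow = Table.from_file(path)     # with datatype guessing
```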
In\u00a0[17]: Copied! def contains_benchmark(table):\n results = Table()\n results.add_columns( \"column\", \"time (s)\" )\n for name,col in table.columns.items():\n n = len(col)\n start,stop,step = int(n*0.02), int(n*0.98), int(n/100)\n selection = col[start:stop:step]\n total_time = 0.0\n for v in selection:\n start_time = perf_counter()\n v in col # <--- test!\n end_time = perf_counter()\n total_time += (end_time - start_time)\n avg_time = total_time / len(selection)\n results.add_rows( name, round(avg_time,3) )\n\n return results\n def contains_benchmark(table): results = Table() results.add_columns( \"column\", \"time (s)\" ) for name,col in table.columns.items(): n = len(col) start,stop,step = int(n*0.02), int(n*0.98), int(n/100) selection = col[start:stop:step] total_time = 0.0 for v in selection: start_time = perf_counter() v in col # <--- test! end_time = perf_counter() total_time += (end_time - start_time) avg_time = total_time / len(selection) results.add_rows( name, round(avg_time,3) ) return results In\u00a0[18]: Copied! has_it = contains_benchmark(tables[-1])\nhas_it\n has_it = contains_benchmark(tables[-1]) has_it Out[18]: #columntime (s) 0#0.001 110.043 220.032 330.001 440.001 550.001 660.006 770.003 880.006 990.00710100.04311110.655 In\u00a0[19]: Copied! def slicing_benchmark(table):\n n = len(table)\n start,stop,step = int(0.02*n), int(0.98*n), int(n / 20) # from 2% to 98% in 20 large steps\n start_time = perf_counter()\n snip = table[start:stop:step]\n end_time = perf_counter()\n print(f\"reading {len(table):,} rows to find {len(snip):,} rows took {end_time-start_time:.3f} sec\")\n return snip\n def slicing_benchmark(table): n = len(table) start,stop,step = int(0.02*n), int(0.98*n), int(n / 20) # from 2% to 98% in 20 large steps start_time = perf_counter() snip = table[start:stop:step] end_time = perf_counter() print(f\"reading {len(table):,} rows to find {len(snip):,} rows took {end_time-start_time:.3f} sec\") return snip In\u00a0[20]: Copied! slice_it = slicing_benchmark(tables[-1])\n slice_it = slicing_benchmark(tables[-1]) reading 50,000,000 rows to find 20 rows took 1.435 sec\n In\u00a0[22]: Copied! def column_selection_benchmark(tables):\n results = Table()\n results.add_columns( 'rows')\n results.add_columns(*[f\"n cols={i}\" for i,_ in enumerate(tables[0].columns,start=1)])\n\n for table in tables:\n rr = [len(table)]\n for ix, name in enumerate(table.columns):\n cols = list(table.columns)[:ix+1]\n start_time = perf_counter()\n table[cols]\n end_time = perf_counter()\n rr.append(f\"{end_time-start_time:.5f}\")\n results.add_rows( rr )\n return results\n def column_selection_benchmark(tables): results = Table() results.add_columns( 'rows') results.add_columns(*[f\"n cols={i}\" for i,_ in enumerate(tables[0].columns,start=1)]) for table in tables: rr = [len(table)] for ix, name in enumerate(table.columns): cols = list(table.columns)[:ix+1] start_time = perf_counter() table[cols] end_time = perf_counter() rr.append(f\"{end_time-start_time:.5f}\") results.add_rows( rr ) return results In\u00a0[23]: Copied! 
csb = column_selection_benchmark(tables)\nprint(\"times below are are in seconds\")\ncsb\n csb = column_selection_benchmark(tables) print(\"times below are are in seconds\") csb times below are are in seconds\n Out[23]: #rowsn cols=1n cols=2n cols=3n cols=4n cols=5n cols=6n cols=7n cols=8n cols=9n cols=10n cols=11n cols=12 010000000.000010.000060.000040.000040.000040.000040.000040.000040.000040.000040.000040.00004 120000000.000010.000080.000030.000030.000030.000030.000030.000030.000030.000030.000040.00004 250000000.000010.000050.000040.000040.000040.000040.000040.000040.000040.000040.000040.00004 3100000000.000020.000050.000040.000040.000040.000040.000070.000050.000050.000050.000050.00005 4200000000.000030.000060.000050.000050.000050.000050.000060.000060.000060.000060.000060.00006 5500000000.000090.000110.000100.000090.000090.000090.000090.000090.000090.000090.000100.00009 In\u00a0[33]: Copied! def iterrows_benchmark(table):\n results = Table()\n results.add_columns( 'n columns', 'time (s)')\n\n columns = ['1']\n for column in list(table.columns):\n columns.append(column)\n snip = table[columns, slice(500_000,1_500_000)]\n start_time = perf_counter()\n counts = 0\n for row in snip.rows:\n counts += 1\n end_time = perf_counter()\n results.add_rows( len(columns), round(end_time-start_time,3))\n\n return results\n def iterrows_benchmark(table): results = Table() results.add_columns( 'n columns', 'time (s)') columns = ['1'] for column in list(table.columns): columns.append(column) snip = table[columns, slice(500_000,1_500_000)] start_time = perf_counter() counts = 0 for row in snip.rows: counts += 1 end_time = perf_counter() results.add_rows( len(columns), round(end_time-start_time,3)) return results In\u00a0[34]: Copied! iterb = iterrows_benchmark(tables[-1])\niterb\n iterb = iterrows_benchmark(tables[-1]) iterb Out[34]: #n columnstime (s) 029.951 139.816 249.859 359.93 469.985 579.942 689.958 799.867 8109.96 9119.93210129.8311139.861 In\u00a0[35]: Copied! import matplotlib.pyplot as plt\nplt.plot(iterb['n columns'], iterb['time (s)'])\nplt.show()\n import matplotlib.pyplot as plt plt.plot(iterb['n columns'], iterb['time (s)']) plt.show() In\u00a0[28]: Copied! tables[-1].types()\n tables[-1].types() Out[28]: {'#': {int: 50000000},\n '1': {int: 50000000},\n '2': {str: 50000000},\n '3': {int: 50000000},\n '4': {int: 50000000},\n '5': {int: 50000000},\n '6': {str: 50000000},\n '7': {str: 50000000},\n '8': {str: 50000000},\n '9': {str: 50000000},\n '10': {float: 50000000},\n '11': {str: 50000000}} In\u00a0[29]: Copied! def dtypes_benchmark(tables):\n dtypes_results = Table()\n dtypes_results.add_columns(\"rows\", \"time (s)\")\n\n for table in tables:\n start_time = perf_counter()\n dt = table.types()\n end_time = perf_counter()\n assert isinstance(dt, dict) and len(dt) != 0\n dtypes_results.add_rows( len(table), round(end_time-start_time, 3) )\n\n return dtypes_results\n def dtypes_benchmark(tables): dtypes_results = Table() dtypes_results.add_columns(\"rows\", \"time (s)\") for table in tables: start_time = perf_counter() dt = table.types() end_time = perf_counter() assert isinstance(dt, dict) and len(dt) != 0 dtypes_results.add_rows( len(table), round(end_time-start_time, 3) ) return dtypes_results In\u00a0[30]: Copied! dtype_b = dtypes_benchmark(tables)\ndtype_b\n dtype_b = dtypes_benchmark(tables) dtype_b Out[30]: #rowstime (s) 010000000.0 120000000.0 250000000.0 3100000000.0 4200000000.0 5500000000.001 In\u00a0[31]: Copied! 
def any_benchmark(tables):\n results = Table()\n results.add_columns(\"rows\", *list(tables[0].columns))\n\n for table in tables:\n tmp = [len(table)]\n for column in list(table.columns):\n v = table[column][0]\n start_time = perf_counter()\n _ = table.any(**{column: v})\n end_time = perf_counter() \n tmp.append(round(end_time-start_time,3))\n\n results.add_rows( tmp )\n return results\n def any_benchmark(tables): results = Table() results.add_columns(\"rows\", *list(tables[0].columns)) for table in tables: tmp = [len(table)] for column in list(table.columns): v = table[column][0] start_time = perf_counter() _ = table.any(**{column: v}) end_time = perf_counter() tmp.append(round(end_time-start_time,3)) results.add_rows( tmp ) return results In\u00a0[32]: Copied! anyb = any_benchmark(tables)\nanyb\n anyb = any_benchmark(tables) anyb Out[32]: ~rows#1234567891011 010000000.1330.1330.1780.1330.2920.1470.1690.1430.2270.2590.1460.17 120000000.2680.2630.3430.2650.5670.2940.3350.2750.4640.5230.2890.323 250000000.6690.6530.9140.6691.4360.7230.8380.6941.1741.3350.6780.818 3100000001.3141.351.7451.3362.9021.491.6831.4142.3542.6181.3431.536 4200000002.5562.5343.3372.6025.6452.8273.2252.6464.5145.082.6933.083 5500000006.5716.4238.4556.69914.4847.9897.7986.25910.98912.486.7327.767 In\u00a0[36]: Copied! def all_benchmark(tables):\n results = Table()\n results.add_columns(\"rows\", *list(tables[0].columns))\n\n for table in tables:\n tmp = [len(table)]\n for column in list(table.columns):\n v = table[column][0]\n start_time = perf_counter()\n _ = table.all(**{column: v})\n end_time = perf_counter() \n tmp.append(round(end_time-start_time,3))\n\n results.add_rows( tmp )\n return results\n def all_benchmark(tables): results = Table() results.add_columns(\"rows\", *list(tables[0].columns)) for table in tables: tmp = [len(table)] for column in list(table.columns): v = table[column][0] start_time = perf_counter() _ = table.all(**{column: v}) end_time = perf_counter() tmp.append(round(end_time-start_time,3)) results.add_rows( tmp ) return results In\u00a0[37]: Copied! allb = all_benchmark(tables)\nallb\n allb = all_benchmark(tables) allb Out[37]: ~rows#1234567891011 010000000.120.1210.1620.1220.2640.1380.1550.1270.2090.2370.1330.151 120000000.2370.2350.3110.2380.520.2660.2970.3410.4510.530.2610.285 250000000.6750.6980.9520.5941.6050.6590.8120.7191.2241.3530.6640.914 3100000001.3141.3321.7071.3323.0911.4631.7811.3662.3582.6381.4091.714 4200000002.5762.3133.112.3965.2072.5732.9212.4034.0414.6582.4632.808 5500000005.8965.827.735.95612.9097.457.275.98110.18311.5766.3727.414 In\u00a0[\u00a0]: Copied! \n In\u00a0[38]: Copied! def unique_benchmark(tables):\n results = Table()\n results.add_columns(\"rows\", *list(tables[0].columns))\n \n for table in tables:\n length = len(table)\n\n tmp = [len(table)]\n for column in list(table.columns):\n start_time = perf_counter()\n try:\n L = table[column].unique()\n dt = perf_counter() - start_time\n except MemoryError:\n dt = -1\n tmp.append(round(dt,3))\n assert 0 < len(L) <= length \n\n results.add_rows( tmp )\n return results\n def unique_benchmark(tables): results = Table() results.add_columns(\"rows\", *list(tables[0].columns)) for table in tables: length = len(table) tmp = [len(table)] for column in list(table.columns): start_time = perf_counter() try: L = table[column].unique() dt = perf_counter() - start_time except MemoryError: dt = -1 tmp.append(round(dt,3)) assert 0 < len(L) <= length results.add_rows( tmp ) return results In\u00a0[39]: Copied! 
ubm = unique_benchmark(tables)\nubm\n ubm = unique_benchmark(tables) ubm Out[39]: ~rows#1234567891011 010000000.0220.0810.2480.0440.0160.0610.1150.1360.0960.0850.0940.447 120000000.1760.2710.5050.0870.0310.1240.2290.2790.1980.170.3051.471 250000000.1980.4991.2630.2180.0760.3110.570.6850.4740.4250.5952.744 3100000000.5021.1232.5350.4330.1550.6151.1281.3750.960.851.3165.826 4200000000.9562.3365.0350.8830.3191.2292.2682.7481.9131.7462.73311.883 5500000002.3956.01912.4992.1780.7643.0735.6086.8194.8284.2797.09730.511 In\u00a0[40]: Copied! def index_benchmark(tables):\n results = Table()\n results.add_columns(\"rows\", *list(tables[0].columns))\n \n for table in tables:\n\n tmp = [len(table)]\n for column in list(table.columns):\n start_time = perf_counter()\n try:\n _ = table.index(column)\n dt = perf_counter() - start_time\n except MemoryError:\n dt = -1\n tmp.append(round(dt,3))\n \n results.add_rows( tmp )\n return results\n def index_benchmark(tables): results = Table() results.add_columns(\"rows\", *list(tables[0].columns)) for table in tables: tmp = [len(table)] for column in list(table.columns): start_time = perf_counter() try: _ = table.index(column) dt = perf_counter() - start_time except MemoryError: dt = -1 tmp.append(round(dt,3)) results.add_rows( tmp ) return results In\u00a0[41]: Copied! ibm = index_benchmark(tables)\nibm\n ibm = index_benchmark(tables) ibm Out[41]: ~rows#1234567891011 010000001.9491.7931.4321.1061.0511.231.3381.4931.4111.3031.9992.325 120000002.8833.5172.8562.2172.1242.4622.6762.9862.7092.6064.0494.461 250000006.3829.0497.0965.6285.3536.3126.6497.5216.716.45910.2710.747 31000000012.55318.50613.9511.33510.72412.50913.3315.05113.50212.89919.76921.999 42000000024.71737.89628.56822.66621.47226.32727.15730.06427.33225.82238.31143.399 55000000063.01697.07772.00755.60954.09961.79768.23675.0769.02266.15299.183109.969 Multi-column index next: In\u00a0[42]: Copied! def multi_column_index_benchmark(tables):\n \n selection = [\"4\", \"7\", \"8\", \"9\"]\n results = Table()\n results.add_columns(\"rows\", *range(1,len(selection)+1))\n \n for table in tables:\n\n tmp = [len(table)]\n for index in range(1,5):\n start_time = perf_counter()\n try:\n _ = table.index(*selection[:index])\n dt = perf_counter() - start_time\n except MemoryError:\n dt = -1\n tmp.append(round(dt,3))\n print('.', end='')\n \n results.add_rows( tmp )\n return results\n def multi_column_index_benchmark(tables): selection = [\"4\", \"7\", \"8\", \"9\"] results = Table() results.add_columns(\"rows\", *range(1,len(selection)+1)) for table in tables: tmp = [len(table)] for index in range(1,5): start_time = perf_counter() try: _ = table.index(*selection[:index]) dt = perf_counter() - start_time except MemoryError: dt = -1 tmp.append(round(dt,3)) print('.', end='') results.add_rows( tmp ) return results In\u00a0[43]: Copied! mcib = multi_column_index_benchmark(tables)\nmcib\n mcib = multi_column_index_benchmark(tables) mcib ........................ Out[43]: #rows1234 010000001.0582.1333.2154.052 120000002.124.2786.5468.328 250000005.30310.8916.69320.793 31000000010.58122.40733.46241.91 42000000021.06445.95467.78184.828 55000000052.347109.551166.6211.053 In\u00a0[44]: Copied! 
def drop_duplicates_benchmark(tables):\n results = Table()\n results.add_columns(\"rows\", *list(tables[0].columns))\n \n for table in tables:\n result = [len(table)]\n cols = []\n for name in list(table.columns):\n cols.append(name)\n start_time = perf_counter()\n try:\n _ = table.drop_duplicates(*cols)\n dt = perf_counter() - start_time\n except MemoryError:\n dt = -1\n result.append(round(dt,3))\n print('.', end='')\n \n results.add_rows( result )\n return results\n def drop_duplicates_benchmark(tables): results = Table() results.add_columns(\"rows\", *list(tables[0].columns)) for table in tables: result = [len(table)] cols = [] for name in list(table.columns): cols.append(name) start_time = perf_counter() try: _ = table.drop_duplicates(*cols) dt = perf_counter() - start_time except MemoryError: dt = -1 result.append(round(dt,3)) print('.', end='') results.add_rows( result ) return results In\u00a0[45]: Copied! ddb = drop_duplicates_benchmark(tables)\nddb\n ddb = drop_duplicates_benchmark(tables) ddb ........................................................................ Out[45]: ~rows#1234567891011 010000001.7612.3583.3133.9014.6154.9615.8356.5347.4548.1088.8039.682 120000003.0114.936.9347.979.26410.26812.00613.51714.9216.63117.93219.493 250000006.82713.85318.63721.23724.54827.1131.15735.02638.99243.53146.02250.433 31000000013.23831.74641.14146.91753.17258.24167.99274.65182.7491.45897.666104.82 42000000025.93277.75100.34109.314123.514131.874148.432163.57179.121196.047208.686228.059 55000000064.237312.222364.886388.249429.724466.685494.418535.367581.666607.306634.343683.858"},{"location":"benchmarks/#benchmarks","title":"Benchmarks\u00b6","text":"These benchmarks seek to establish the performance of tablite as a user sees it. Overview Input/Output Various column functions Base functions Core functions - Save / Load .tpz format- Save tables to various formats- Import data from various formats - Setitem / getitem- iter- equal, not equal- copy- t += t- t *= t- contains- remove all- replace- index- unique- histogram- statistics- count - Setitem / getitem- iter / rows- equal, not equal- load- save- copy- stack- types- display_dict- show- to_dict- as_json_serializable- index - expression- filter- sort_index- reindex- drop_duplicates- sort- is_sorted- any- all- drop - replace- groupby- pivot- joins- lookup- replace missing values- transpose- pivot_transpose- diff"},{"location":"benchmarks/#input-output","title":"Input / Output\u00b6","text":""},{"location":"benchmarks/#create-tables-from-synthetic-data","title":"Create tables from synthetic data.\u00b6","text":""},{"location":"benchmarks/#save-load-tpz-format","title":"Save / Load .tpz format\u00b6","text":"Without default compression settings (10% slower than uncompressed, 20% of uncompressed filesize) "},{"location":"benchmarks/#save-load-tables-to-from-various-formats","title":"Save / load tables to / from various formats\u00b6","text":"The handlers for saving / export are: - to_sql
- to_json
- to_xlsx
- to_ods
- to_csv
- to_tsv
- to_text
- to_html
- to_hdf5
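A minimal sketch of the handlers, plus the extension-driven table.export from the changelog below; the file names are illustrative.

```python
from tablite import Table

t = Table({'A': [1, 2, 3], 'B': ['a', 'b', 'c']})

t.to_csv(path='example.csv')  # dedicated handler, as listed above
s = t.to_json()               # to_json returns a JSON string

# or let the file extension pick the format (per the changelog entry for 2022.8.0)
t.export('example.xlsx')
```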
"},{"location":"benchmarks/#various-column-functions","title":"Various column functions\u00b6","text":" - Setitem / getitem
- iter
- equal, not equal
- copy
- t += t
- t *= t
- contains
- remove all
- replace
- index
- unique
- histogram
- statistics
- count
"},{"location":"benchmarks/#various-table-functions","title":"Various table functions\u00b6","text":""},{"location":"benchmarks/#slicing","title":"Slicing\u00b6","text":"Slicing operations are used in many places. "},{"location":"benchmarks/#tabletypes","title":"Table.types()\u00b6","text":"Table.types() is implemented for near constant speed lookup. Here is an example: "},{"location":"benchmarks/#tableany","title":"Table.any\u00b6","text":""},{"location":"benchmarks/#tableall","title":"Table.all\u00b6","text":""},{"location":"benchmarks/#tablefilter","title":"Table.filter\u00b6","text":""},{"location":"benchmarks/#tableunique","title":"Table.unique\u00b6","text":""},{"location":"benchmarks/#tableindex","title":"Table.index\u00b6","text":"Single column index first: "},{"location":"benchmarks/#drop-duplicates","title":"drop duplicates\u00b6","text":""},{"location":"changelog/","title":"Changelog","text":"Version Change 2023.9.0 Adding Table.match operation. 2023.8.0 Nim backend for csv importer.Improve excel importer.Improve slicing consistency.Logical cores re-enabled on *nix based systems.Filter is now type safe.Added merge utility.Various bugfixes. 2023.6.5 Fix issues with get_headers falling back to text reading when reading 0 lines of excel, fix issue where reading excel file would ignore file count, excel file reader now has parity for linecount selection. 2023.6.4 Fix a logic bug in get_headers that caused one extra line to be returned than requested. 2023.6.3 Updated the way reference counting works. Tablite now tracks references to used pages and cleans them up based on number of references to those pages in the current process. This change allows to handle deep table clones when sending tables via processes (pickling/unpickling), whereas previous implementation would corrupt all tables using same pages due to reference counting asserting that all tables are shallow copies to the same object. 2023.6.2 Updated mplite dependency, changed to soft version requirement to prevent pipeline freezes due to small bugfixes in mplite . 2023.6.1 Major change of the backend processes. Speed up of ~6x. For more see the release notes 2022.11.19 Fixed some memory leaks. 2022.11.18 copy , filter , sort , any , all methods now properly respects the table subclass.Filter for tables with under SINGLE_PROCESSING_LIMIT rows will run on same process to reduce overhead.Errors within child processes now properly propagate to parent.Table.reset_storage(include_imports=True) now allows the user to reset the storage but exclude any imported files by setting include_imports=False during Table.reset(...) .Bug: A column with 1,None,2 would be written to csv & tsv as \"1,None,2\" . Now it is written \"1,,2\" where None means absent.Fix mp join producing mismatched columns lengths when different table lengths are used as an input or when join product is longer than the input table. 2022.11.17 Table.load now properly subclassess the table instead of always resulting in tablite.Table .Table.from_* methods now respect subclassess, fixed some from_* methods which were instance methods and not class methods.Fixed Table.from_dict only accepting list and tuple but not tablite.Column which is an equally valid type.Fix lookup parity in single process and multiple process outputs.Fix an issue with multiprocess lookup where no matches would throw instead of producing None .Fix an issue with filtering an empty table. 2022.11.16 Changed join to process 1M rows per task to avoid potential OOM on lower memory systems. 
Added mp_merge_columns to MemoryManager that merges column pages into a single column.Fix join parity in single process and multiple process outputs.Fix an issue with multiprocess join where no matches would throw instead of producing None . 2022.11.15 Bump mplite to avoid deadlock issues OS kill the process. 2022.11.14 Improve locking mechanism to allow retries when opening file as the previous solution could cause deadlocks when running multiple threads. 2022.11.13 Fix an issue with copying empty pages. 2022.11.12 Tablite now is now able to create it's own temporary directory. 2022.11.11 text_reader tqdm tracks the entire process now. text_reader properly respects free memory in *nix based systems. text_reader no longer discriminates against hyperthreaded cores. 2022.11.10 get_headers now uses plain openpyxl instead of pyexcel wrapper to speed up fetch times ~10x on certain files. 2022.11.9 get_headers can fail safe on unrecognized characters. 2022.11.8 Fix a bug with task size calculation on single core systems. 2022.11.7 Added TABLITE_TMPDIR environment variable for setting tablite work directory. Characters that fail to be read text reader due to improper encoding will be skipped. Fixed an issue where single column text files with no column delimiters would be imported as empty tables. 2022.11.6 Date inference fix 2022.11.5 Fixed negative slicing issues 2022.11.4 Transpose API changes: table.transpose(...) was renamed to table.pivot_transpose(...) new table.transpose() and table.T were added, it's functionality acts similarly to numpy.T , the column headers are used the first row in the table when transposing. 2022.11.3 Bugfix for non-ascii encoded strings during t.add_rows(...) 2022.11.2 As utf-8 is ascii compatible, the file reader utils selects utf-8 instead of ascii as a default. 2022.11.1 bugfix in datatypes.infer() where 1 was inferred as int, not float. 2022.11.0 New table features: Table.diff(other, columns=...) , table.remove_duplicates_rows() , table.drop_na(*arg) ,table.replace(target,replacement) , table.imputation(sources, targets, methods=...) , table.to_pandas() and Table.from_pandas(pd.DataFrame) ,table.to_dict(columns, slice) , Table.from_dict() ,table.transpose(columns, keep, ...) , New column features: Column.count(item) , Column[:] is guaranteed to return a python list.Column.to_numpy(slice) returns np.ndarray . new tools library: from tablite import tools with: date_range(start,end) , xround(value, multiple, up=None) , and, guess as short-cut for Datatypes.guess(...) . bugfixes: __eq__ was updated but missed __ne__ .in operator in filter would crash if datatypes were not strings. 2022.10.11 filter now accepts any expression (str) that can be compiled by pythons compiler 2022.10.11 Bugfix for .any and .all . The code now executes much faster 2022.10.10 Bugfix for Table.import_file : import_as has been removed from keywords. 2022.10.10 All Table functions now have tqdm progressbar. 2022.10.10 More robust calculation for task size for multiprocessing. 2022.10.10 Dependency update: mplite==1.2.0 is now required. 2022.10.9 Bugfix for Table.import_file : files with duplicate header names would only have last duplicate name imported.Now the headers are made unique using name_x where x is a number. 2022.10.8 Bugfix for groupby: Where keys are empty error should have been raised.Where there are no functions, unique keypairs are returned. 
2022.10.7 Bugfix for Column.statistics() for an empty column 2022.10.6 Bugfix for __setitem__ : tbl['a'] = [] is now seen as tbl.add_column('a') Bugfix for __getitem__ : calling a missing key raises keyerror. 2022.10.5 Bugfix for summary statistics. 2022.10.4 Bugfix for join shortcut. 2022.10.3 Bugfix for DataTypes where bool was evaluated wrongly 2022.10.0 Added ability to reindex in table.reindex(index=[0,1...,n,n-1]) 2022.9.0 Added ability to store python objects (example).Added warning when user iterates over non-rectangular dataset. 2022.8.0 Added table.export(path) which exports tablite Tables to file format given by the file extension. For example my_table.export('example.xlsx') .supported formats are: json , html , xlsx , xls , csv , tsv , txt , ods and sql . 2022.7.8 Added ability to forward tqdm progressbar into Table.import_file(..., tqdm=your_tqdm) , so that Jupyter notebook can use it in display -methods. 2022.7.7 Added method Table.to_sql() for export to ANSI-92 SQL enginesBugfix on to_json for timedelta . Jupyter notebook provides nice view using Table._repr_html_() JS-users can use .as_json_serializable where suitable. 2022.7.6 get_headers now takes argument (path, linecount=10) 2022.7.5 added helper Table.as_json_serializable as Jupyterkernel compat. 2022.7.4 adder helper Table.to_dict , and updated Table.to_json 2022.7.3 table.to_json now takes kwargs: row_count , columns , slice_ , start_on 2022.7.2 documentation update. 2022.7.1 minor bugfix. 2022.7.0 BREAKING CHANGES- Tablite now uses HDF5 as backend. - Has multiprocessing enabled by default. - Is 20x faster. - Completely new API. 2022.6.0 DataTypes.guess([list of strings]) returns the best matching python datatype."},{"location":"tutorial/","title":"Tutorial","text":"In\u00a0[1]: Copied! from tablite import Table\n\n## To create a tablite table is as simple as populating a dictionary:\nt = Table({'A':[1,2,3], 'B':['a','b','c']})\n from tablite import Table ## To create a tablite table is as simple as populating a dictionary: t = Table({'A':[1,2,3], 'B':['a','b','c']}) In\u00a0[2]: Copied! ## In this notebook we can show tables in the HTML style:\nt\n ## In this notebook we can show tables in the HTML style: t Out[2]: #AB 01a 12b 23c In\u00a0[3]: Copied! ## or the ascii style:\nt.show()\n ## or the ascii style: t.show() +==+=+=+\n|# |A|B|\n+--+-+-+\n| 0|1|a|\n| 1|2|b|\n| 2|3|c|\n+==+=+=+\n In\u00a0[4]: Copied! ## or if you'd like to inspect the table, use:\nprint(str(t))\n ## or if you'd like to inspect the table, use: print(str(t)) Table(2 columns, 3 rows)\n In\u00a0[5]: Copied! ## You can also add all columns at once (slower) if you prefer. \nt2 = Table(headers=('A','B'), rows=((1,'a'),(2,'b'),(3,'c')))\nassert t==t2\n ## You can also add all columns at once (slower) if you prefer. t2 = Table(headers=('A','B'), rows=((1,'a'),(2,'b'),(3,'c'))) assert t==t2 In\u00a0[6]: Copied! ## or load data:\nt3 = Table.from_file('tests/data/book1.csv')\n\n## to view any table in the notebook just let jupyter show the table. If you're using the terminal use .show(). \n## Note that show gives either first and last 7 rows or the whole table if it is less than 20 rows.\nt3\n ## or load data: t3 = Table.from_file('tests/data/book1.csv') ## to view any table in the notebook just let jupyter show the table. If you're using the terminal use .show(). ## Note that show gives either first and last 7 rows or the whole table if it is less than 20 rows. 
t3 Collecting tasks: 'tests/data/book1.csv'\nDumping tasks: 'tests/data/book1.csv'\n importing file: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 1/1 [00:00<00:00, 487.82it/s]\n Out[6]: #abcdef 010.0606060610.0909090910.1212121210.1515151520.181818182 120.1212121210.2424242420.4848484850.969696971.939393939 230.2424242420.4848484850.969696971.9393939393.878787879 340.4848484850.969696971.9393939393.8787878797.757575758 450.969696971.9393939393.8787878797.75757575815.51515152 561.9393939393.8787878797.75757575815.5151515231.03030303 673.8787878797.75757575815.5151515231.0303030362.06060606.....................383916659267088.033318534175.066637068350.0133274000000.0266548000000.0394033318534175.066637068350.0133274000000.0266548000000.0533097000000.0404166637068350.0133274000000.0266548000000.0533097000000.01066190000000.04142133274000000.0266548000000.0533097000000.01066190000000.02132390000000.04243266548000000.0533097000000.01066190000000.02132390000000.04264770000000.04344533097000000.01066190000000.02132390000000.04264770000000.08529540000000.044451066190000000.02132390000000.04264770000000.08529540000000.017059100000000.0 In\u00a0[7]: Copied! ## should you however want to select the headers instead of importing everything\n## (which maybe timeconsuming), simply use get_headers(path)\nfrom tablite.tools import get_headers\nfrom pathlib import Path\npath = Path('tests/data/book1.csv')\nsample = get_headers(path, linecount=5)\nprint(f\"sample is of type {type(sample)} and has the following entries:\")\nfor k,v in sample.items():\n print(k)\n if isinstance(v,list):\n for r in sample[k]:\n print(\"\\t\", r)\n ## should you however want to select the headers instead of importing everything ## (which maybe timeconsuming), simply use get_headers(path) from tablite.tools import get_headers from pathlib import Path path = Path('tests/data/book1.csv') sample = get_headers(path, linecount=5) print(f\"sample is of type {type(sample)} and has the following entries:\") for k,v in sample.items(): print(k) if isinstance(v,list): for r in sample[k]: print(\"\\t\", r) sample is of type <class 'dict'> and has the following entries:\ndelimiter\nbook1.csv\n\t ['a', 'b', 'c', 'd', 'e', 'f']\n\t ['1', '0.060606061', '0.090909091', '0.121212121', '0.151515152', '0.181818182']\n\t ['2', '0.121212121', '0.242424242', '0.484848485', '0.96969697', '1.939393939']\n\t ['3', '0.242424242', '0.484848485', '0.96969697', '1.939393939', '3.878787879']\n\t ['4', '0.484848485', '0.96969697', '1.939393939', '3.878787879', '7.757575758']\n\t ['5', '0.96969697', '1.939393939', '3.878787879', '7.757575758', '15.51515152']\n In\u00a0[8]: Copied! ## to extend a table by adding columns, use t[new] = [new values]\nt['C'] = [4,5,6]\n## but make sure the column has the same length as the rest of the table!\nt\n ## to extend a table by adding columns, use t[new] = [new values] t['C'] = [4,5,6] ## but make sure the column has the same length as the rest of the table! t Out[8]: #ABC 01a4 12b5 23c6 In\u00a0[9]: Copied! 
## should you want to mix datatypes, tablite will not complain:\nfrom datetime import datetime, date,time,timedelta\nimport numpy as np\n## What you put in ...\nt4 = Table()\nt4['mixed'] = [\n -1,0,1, # regular integers\n -12345678909876543211234567890987654321, # very very large integer\n None,np.nan, # null values \n \"one\", \"\", # strings\n True,False, # booleans\n float('inf'), 0.01, # floats\n date(2000,1,1), # date\n datetime(2002,2,3,23,0,4,6660), # datetime\n time(12,12,12), # time\n timedelta(days=3, seconds=5678) # timedelta\n]\n## ... is exactly what you get out:\nt4\n ## should you want to mix datatypes, tablite will not complain: from datetime import datetime, date,time,timedelta import numpy as np ## What you put in ... t4 = Table() t4['mixed'] = [ -1,0,1, # regular integers -12345678909876543211234567890987654321, # very very large integer None,np.nan, # null values \"one\", \"\", # strings True,False, # booleans float('inf'), 0.01, # floats date(2000,1,1), # date datetime(2002,2,3,23,0,4,6660), # datetime time(12,12,12), # time timedelta(days=3, seconds=5678) # timedelta ] ## ... is exactly what you get out: t4 Out[9]: #mixed 0-1 10 21 3-12345678909876543211234567890987654321 4None 5nan 6one 7 8True 9False10inf110.01122000-01-01132002-02-03 23:00:04.0066601412:12:12153 days, 1:34:38 In\u00a0[10]: Copied! ## also if you claim the values back as a python list:\nfor item in list(t4['mixed']):\n print(item)\n ## also if you claim the values back as a python list: for item in list(t4['mixed']): print(item) -1\n0\n1\n-12345678909876543211234567890987654321\nNone\nnan\none\n\nTrue\nFalse\ninf\n0.01\n2000-01-01\n2002-02-03 23:00:04.006660\n12:12:12\n3 days, 1:34:38\n The column itself (__repr__ ) shows us the pid , file location and the entries, so you know exactly what you're working with. In\u00a0[11]: Copied! t4['mixed']\n t4['mixed'] Out[11]: Column(/tmp/tablite-tmp/pid-54911, [-1 0 1 -12345678909876543211234567890987654321 None nan 'one' '' True\n False inf 0.01 datetime.date(2000, 1, 1)\n datetime.datetime(2002, 2, 3, 23, 0, 4, 6660) datetime.time(12, 12, 12)\n datetime.timedelta(days=3, seconds=5678)]) In\u00a0[12]: Copied! ## to view the datatypes in a column, use Column.types()\ntype_dict = t4['mixed'].types()\nfor k,v in type_dict.items():\n print(k,v)\n ## to view the datatypes in a column, use Column.types() type_dict = t4['mixed'].types() for k,v in type_dict.items(): print(k,v) <class 'int'> 4\n<class 'NoneType'> 1\n<class 'float'> 3\n<class 'str'> 2\n<class 'bool'> 2\n<class 'datetime.date'> 1\n<class 'datetime.datetime'> 1\n<class 'datetime.time'> 1\n<class 'datetime.timedelta'> 1\n In\u00a0[13]: Copied! ## You may have noticed that all datatypes in t3 where identified as floats, despite their origin from a text type file.\n## This is because tablite guesses the most probable datatype using the `.guess` function on each column.\n## You can use the .guess function like this:\nfrom tablite import DataTypes\nt3['a'] = DataTypes.guess(t3['a'])\n## You can also convert the datatype using a list comprehension\nt3['b'] = [float(v) for v in t3['b']]\nt3\n ## You may have noticed that all datatypes in t3 where identified as floats, despite their origin from a text type file. ## This is because tablite guesses the most probable datatype using the `.guess` function on each column. 
## You can use the .guess function like this: from tablite import DataTypes t3['a'] = DataTypes.guess(t3['a']) ## You can also convert the datatype using a list comprehension t3['b'] = [float(v) for v in t3['b']] t3 Out[13]: #abcdef 010.0606060610.0909090910.1212121210.1515151520.181818182 120.1212121210.2424242420.4848484850.969696971.939393939 230.2424242420.4848484850.969696971.9393939393.878787879 340.4848484850.969696971.9393939393.8787878797.757575758 450.969696971.9393939393.8787878797.75757575815.51515152 561.9393939393.8787878797.75757575815.5151515231.03030303 673.8787878797.75757575815.5151515231.0303030362.06060606.....................383916659267088.033318534175.066637068350.0133274000000.0266548000000.0394033318534175.066637068350.0133274000000.0266548000000.0533097000000.0404166637068350.0133274000000.0266548000000.0533097000000.01066190000000.04142133274000000.0266548000000.0533097000000.01066190000000.02132390000000.04243266548000000.0533097000000.01066190000000.02132390000000.04264770000000.04344533097000000.01066190000000.02132390000000.04264770000000.08529540000000.044451066190000000.02132390000000.04264770000000.08529540000000.017059100000000.0 In\u00a0[14]: Copied! t = Table()\nfor column_name in 'abcde':\n t[column_name] =[i for i in range(5)]\n t = Table() for column_name in 'abcde': t[column_name] =[i for i in range(5)] (2) we want to add two new columns using the functions: In\u00a0[15]: Copied! def f1(a,b,c):\n return a+b+c+1\ndef f2(b,c,d):\n return b*c*d\n def f1(a,b,c): return a+b+c+1 def f2(b,c,d): return b*c*d (3) and we want to compute two new columns f and g : In\u00a0[16]: Copied! t.add_columns('f', 'g')\n t.add_columns('f', 'g') (4) we can now use the filter, to iterate over the table, and add the values to the two new columns: In\u00a0[17]: Copied! f,g=[],[]\nfor row in t['a', 'b', 'c', 'd'].rows:\n a, b, c, d = row\n\n f.append(f1(a, b, c))\n g.append(f2(b, c, d))\nt['f'] = f\nt['g'] = g\n\nassert len(t) == 5\nassert list(t.columns) == list('abcdefg')\nt\n f,g=[],[] for row in t['a', 'b', 'c', 'd'].rows: a, b, c, d = row f.append(f1(a, b, c)) g.append(f2(b, c, d)) t['f'] = f t['g'] = g assert len(t) == 5 assert list(t.columns) == list('abcdefg') t Out[17]: #abcdefg 00000010 11111141 22222278 3333331027 4444441364 Take note that if your dataset is assymmetric, a warning will be show: In\u00a0[18]: Copied! assymmetric_table = Table({'a':[1,2,3], 'b':[1,2]})\nfor row in assymmetric_table.rows:\n print(row)\n## warning at the bottom ---v\n assymmetric_table = Table({'a':[1,2,3], 'b':[1,2]}) for row in assymmetric_table.rows: print(row) ## warning at the bottom ---v [1, 1]\n[2, 2]\n[3, None]\n /home/bjorn/github/tablite/tablite/base.py:1188: UserWarning: Column b has length 2 / 3. None will appear as fill value.\n warnings.warn(f\"Column {name} has length {len(column)} / {n_max}. None will appear as fill value.\")\n In\u00a0[19]: Copied! table7 = Table(columns={\n'A': [1,1,2,2,3,4],\n'B': [1,1,2,2,30,40],\n'C': [-1,-2,-3,-4,-5,-6]\n})\nindex = table7.index('A', 'B')\nfor k, v in index.items():\n print(\"key\", k, \"indices\", v)\n table7 = Table(columns={ 'A': [1,1,2,2,3,4], 'B': [1,1,2,2,30,40], 'C': [-1,-2,-3,-4,-5,-6] }) index = table7.index('A', 'B') for k, v in index.items(): print(\"key\", k, \"indices\", v) key (1, 1) indices [0, 1]\nkey (2, 2) indices [2, 3]\nkey (3, 30) indices [4]\nkey (4, 40) indices [5]\n The keys are created for each unique column-key-pair, and the value is the index where the key is found. 
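Since the index is a plain dict, a single key's row numbers can also be read directly. A minimal sketch (the key and result match the print-out above):

index[(2, 2)]   # -> [2, 3]
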
To fetch all rows for key (2,2) , we can use: In\u00a0[20]: Copied! for ix, row in enumerate(table7.rows):\n if ix in index[(2,2)]:\n print(row)\n for ix, row in enumerate(table7.rows): if ix in index[(2,2)]: print(row) [2, 2, -3]\n[2, 2, -4]\n In\u00a0[21]: Copied! ## to append one table to another, use + or += \nprint('length before:', len(t3)) # length before: 45\nt5 = t3 + t3 \nprint('length after +', len(t5)) # length after + 90\nt5 += t3 \nprint('length after +=', len(t5)) # length after += 135\n## if you need a lot of numbers for a test, you can repeat a table using * and *=\nt5 *= 1_000\nprint('length after +=', len(t5)) # length after += 135000\n ## to append one table to another, use + or += print('length before:', len(t3)) # length before: 45 t5 = t3 + t3 print('length after +', len(t5)) # length after + 90 t5 += t3 print('length after +=', len(t5)) # length after += 135 ## if you need a lot of numbers for a test, you can repeat a table using * and *= t5 *= 1_000 print('length after +=', len(t5)) # length after += 135000 length before: 45\nlength after + 90\nlength after += 135\nlength after += 135000\n In\u00a0[22]: Copied! t5\n t5 Out[22]: #abcdef 010.0606060610.0909090910.1212121210.1515151520.181818182 120.1212121210.2424242420.4848484850.969696971.939393939 230.2424242420.4848484850.969696971.9393939393.878787879 340.4848484850.969696971.9393939393.8787878797.757575758 450.969696971.9393939393.8787878797.75757575815.51515152 561.9393939393.8787878797.75757575815.5151515231.03030303 673.8787878797.75757575815.5151515231.0303030362.06060606..................... 134,9933916659267088.033318534175.066637068350.0133274000000.0266548000000.0 134,9944033318534175.066637068350.0133274000000.0266548000000.0533097000000.0 134,9954166637068350.0133274000000.0266548000000.0533097000000.01066190000000.0 134,99642133274000000.0266548000000.0533097000000.01066190000000.02132390000000.0 134,99743266548000000.0533097000000.01066190000000.02132390000000.04264770000000.0 134,99844533097000000.01066190000000.02132390000000.04264770000000.08529540000000.0 134,999451066190000000.02132390000000.04264770000000.08529540000000.017059100000000.0 In\u00a0[23]: Copied! ## if your are in doubt whether your tables will be the same you can use .stack(other)\nassert t.columns != t2.columns # compares list of column names.\nt6 = t.stack(t2)\nt6\n ## if your are in doubt whether your tables will be the same you can use .stack(other) assert t.columns != t2.columns # compares list of column names. t6 = t.stack(t2) t6 Out[23]: #abcdefgAB 00000010NoneNone 11111141NoneNone 22222278NoneNone 3333331027NoneNone 4444441364NoneNone 5NoneNoneNoneNoneNoneNoneNone1a 6NoneNoneNoneNoneNoneNoneNone2b 7NoneNoneNoneNoneNoneNoneNone3c In\u00a0[24]: Copied! ## As you can see above, t6['C'] is padded with \"None\" where t2 was missing the columns.\n\n## if you need a more detailed view of the columns you can iterate:\nfor name in t.columns:\n col_from_t = t[name]\n if name in t2.columns:\n col_from_t2 = t2[name]\n print(name, col_from_t == col_from_t2)\n else:\n print(name, \"not in t2\")\n ## As you can see above, t6['C'] is padded with \"None\" where t2 was missing the columns. ## if you need a more detailed view of the columns you can iterate: for name in t.columns: col_from_t = t[name] if name in t2.columns: col_from_t2 = t2[name] print(name, col_from_t == col_from_t2) else: print(name, \"not in t2\") a not in t2\nb not in t2\nc not in t2\nd not in t2\ne not in t2\nf not in t2\ng not in t2\n In\u00a0[25]: Copied! 
## to make a copy of a table, use table.copy()\nt3_copy = t3.copy()\n\n## you can also perform multi criteria selections using getitem [ ... ]\nt3_slice = t3['a','b','d', 5:25:5]\nt3_slice\n ## to make a copy of a table, use table.copy() t3_copy = t3.copy() ## you can also perform multi criteria selections using getitem [ ... ] t3_slice = t3['a','b','d', 5:25:5] t3_slice Out[25]: #abd 061.9393939397.757575758 11162.06060606248.2424242 2161985.9393947943.757576 32163550.06061254200.2424 In\u00a0[26]: Copied! ##deleting items also works the same way:\ndel t3_slice[1:3] # delete row number 2 & 3 \nt3_slice\n ##deleting items also works the same way: del t3_slice[1:3] # delete row number 2 & 3 t3_slice Out[26]: #abd 061.9393939397.757575758 12163550.06061254200.2424 In\u00a0[27]: Copied! ## to wipe a table, use .clear:\nt3_slice.clear()\nt3_slice\n ## to wipe a table, use .clear: t3_slice.clear() t3_slice Out[27]: Empty Table In\u00a0[28]: Copied! ## tablite uses .npy for storage because it is fast.\n## this means you can make a table persistent using .save\nlocal_file = Path(\"local_file.tpz\")\nt5.save(local_file)\n\nold_t5 = Table.load(local_file)\nprint(\"the t5 table had\", len(old_t5), \"rows\") # the t5 table had 135000 rows\n\ndel old_t5 # only removes the in-memory object\n\nprint(\"old_t5 still exists?\", local_file.exists())\nprint(\"path:\", local_file)\n\nimport os\nos.remove(local_file)\n ## tablite uses .npy for storage because it is fast. ## this means you can make a table persistent using .save local_file = Path(\"local_file.tpz\") t5.save(local_file) old_t5 = Table.load(local_file) print(\"the t5 table had\", len(old_t5), \"rows\") # the t5 table had 135000 rows del old_t5 # only removes the in-memory object print(\"old_t5 still exists?\", local_file.exists()) print(\"path:\", local_file) import os os.remove(local_file) loading 'local_file.tpz' file: 55%|\u2588\u2588\u2588\u2588\u2588\u258d | 9851/18000 [00:02<00:01, 4386.96it/s] loading 'local_file.tpz' file: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 18000/18000 [00:04<00:00, 4417.27it/s]\n the t5 table had 135000 rows\nold_t5 still exists? True\npath: local_file.tpz\n If you want to save a table from one session to another use save=True . This tells the garbage collector to leave the tablite Table on disk, so you can load it again without changing your code. For example: First time you run t = Table.import_file(....big.csv) it may take a minute or two. If you then add t.save=True and restart python, the second time you run t = Table.import_file(....big.csv) it will take a few milliseconds instead of minutes. In\u00a0[29]: Copied! unfiltered = Table({'a':[1,2,3,4], 'b':[10,20,30,40]})\n unfiltered = Table({'a':[1,2,3,4], 'b':[10,20,30,40]}) In\u00a0[30]: Copied! true,false = unfiltered.filter(\n [\n {\"column1\": 'a', \"criteria\":\">=\", 'value2':3}\n ], filter_type='all'\n)\n true,false = unfiltered.filter( [ {\"column1\": 'a', \"criteria\":\">=\", 'value2':3} ], filter_type='all' ) In\u00a0[31]: Copied! true\n true Out[31]: #ab 0330 1440 In\u00a0[32]: Copied! false.show() # using show here to show that terminal users can have a nice view too.\n false.show() # using show here to show that terminal users can have a nice view too. +==+=+==+\n|# |a|b |\n+--+-+--+\n| 0|1|10|\n| 1|2|20|\n+==+=+==+\n In\u00a0[33]: Copied! ty = Table({'a':[1,2,3,4],'b': [10,20,30,40]})\n ty = Table({'a':[1,2,3,4],'b': [10,20,30,40]}) In\u00a0[34]: Copied! 
## typical python\nany(i > 3 for i in ty['a'])\n ## typical python any(i > 3 for i in ty['a']) Out[34]: True In\u00a0[35]: Copied! ## hereby you can do:\nany( ty.any(**{'a':lambda x:x>3}).rows )\n ## hereby you can do: any( ty.any(**{'a':lambda x:x>3}).rows ) Out[35]: True In\u00a0[36]: Copied! ## if you have multiple criteria this also works:\nall( ty.all(**{'a': lambda x:x>=2, 'b': lambda x:x<=30}).rows )\n ## if you have multiple criteria this also works: all( ty.all(**{'a': lambda x:x>=2, 'b': lambda x:x<=30}).rows ) Out[36]: True In\u00a0[37]: Copied! ## or this if you want to see the table.\nty.all(a=lambda x:x>2, b=lambda x:x<=30)\n ## or this if you want to see the table. ty.all(a=lambda x:x>2, b=lambda x:x<=30) Out[37]: #ab 0330 In\u00a0[38]: Copied! ## As `all` and `any` returns tables, this also means that you can chain operations:\nty.any(a=lambda x:x>2).any(b=30)\n ## As `all` and `any` returns tables, this also means that you can chain operations: ty.any(a=lambda x:x>2).any(b=30) Out[38]: #ab 0330 In\u00a0[39]: Copied! table = Table({\n 'A':[ 1, None, 8, 3, 4, 6, 5, 7, 9],\n 'B':[10,'100', 1, 1, 1, 1, 10, 10, 10],\n 'C':[ 0, 1, 0, 1, 0, 1, 0, 1, 0],\n})\ntable\n table = Table({ 'A':[ 1, None, 8, 3, 4, 6, 5, 7, 9], 'B':[10,'100', 1, 1, 1, 1, 10, 10, 10], 'C':[ 0, 1, 0, 1, 0, 1, 0, 1, 0], }) table Out[39]: #ABC 01100 1None1001 2810 3311 4410 5611 65100 77101 89100 In\u00a0[40]: Copied! sort_order = {'B': False, 'C': False, 'A': False}\nassert not table.is_sorted(mapping=sort_order)\n\nsorted_table = table.sort(mapping=sort_order)\nsorted_table\n sort_order = {'B': False, 'C': False, 'A': False} assert not table.is_sorted(mapping=sort_order) sorted_table = table.sort(mapping=sort_order) sorted_table creating sort index: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 3/3 [00:00<00:00, 2719.45it/s]\ncreating sort index: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 3/3 [00:00<00:00, 3434.20it/s]\njoin: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 3/3 [00:00<00:00, 1902.47it/s]\n Sort is reasonable effective as it uses multiprocessing above a million fields. Hint: You can set this limit in tablite.config , like this: In\u00a0[41]: Copied! from tablite.config import Config\nprint(f\"multiprocessing is used above {Config.SINGLE_PROCESSING_LIMIT:,} fields\")\n from tablite.config import Config print(f\"multiprocessing is used above {Config.SINGLE_PROCESSING_LIMIT:,} fields\") multiprocessing is used above 1,000,000 fields\n In\u00a0[42]: Copied! import math\nn = math.ceil(1_000_000 / (9*3))\n\ntable = Table({\n 'A':[ 1, None, 8, 3, 4, 6, 5, 7, 9]*n,\n 'B':[10,'100', 1, 1, 1, 1, 10, 10, 10]*n,\n 'C':[ 0, 1, 0, 1, 0, 1, 0, 1, 0]*n,\n})\ntable\n import math n = math.ceil(1_000_000 / (9*3)) table = Table({ 'A':[ 1, None, 8, 3, 4, 6, 5, 7, 9]*n, 'B':[10,'100', 1, 1, 1, 1, 10, 10, 10]*n, 'C':[ 0, 1, 0, 1, 0, 1, 0, 1, 0]*n, }) table Out[42]: #ABC 01100 1None1001 2810 3311 4410 5611 65100............ 333,335810 333,336311 333,337410 333,338611 333,3395100 333,3407101 333,3419100 In\u00a0[43]: Copied! import time as cputime\nstart = cputime.time()\nsort_order = {'B': False, 'C': False, 'A': False}\nsorted_table = table.sort(mapping=sort_order) # sorts 1M values.\nprint(\"table sorting took \", round(cputime.time() - start,3), \"secs\")\nsorted_table\n import time as cputime start = cputime.time() sort_order = {'B': False, 'C': False, 'A': False} sorted_table = table.sort(mapping=sort_order) # sorts 1M values. 
print(\"table sorting took \", round(cputime.time() - start,3), \"secs\") sorted_table creating sort index: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 3/3 [00:00<00:00, 4.20it/s]\njoin: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 3/3 [00:00<00:00, 18.17it/s] table sorting took 0.913 secs\n \n In\u00a0[44]: Copied! n = math.ceil(1_000_000 / (9*3))\n\ntable = Table({\n 'A':[ 1, None, 8, 3, 4, 6, 5, 7, 9]*n,\n 'B':[10,'100', 1, 1, 1, 1, 10, 10, 10]*n,\n 'C':[ 0, 1, 0, 1, 0, 1, 0, 1, 0]*n,\n})\ntable\n n = math.ceil(1_000_000 / (9*3)) table = Table({ 'A':[ 1, None, 8, 3, 4, 6, 5, 7, 9]*n, 'B':[10,'100', 1, 1, 1, 1, 10, 10, 10]*n, 'C':[ 0, 1, 0, 1, 0, 1, 0, 1, 0]*n, }) table Out[44]: #ABC 01100 1None1001 2810 3311 4410 5611 65100............ 333,335810 333,336311 333,337410 333,338611 333,3395100 333,3407101 333,3419100 In\u00a0[45]: Copied! from tablite import GroupBy as gb\ngrpby = table.groupby(keys=['C', 'B'], functions=[('A', gb.count)])\ngrpby\n from tablite import GroupBy as gb grpby = table.groupby(keys=['C', 'B'], functions=[('A', gb.count)]) grpby groupby: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 333342/333342 [00:00<00:00, 427322.50it/s]\n Out[45]: #CBCount(A) 0010111114 1110037038 20174076 31174076 411037038 Here is the list of groupby functions: class GroupBy(object): \n max = Max # shortcuts to avoid having to type a long list of imports.\n min = Min\n sum = Sum\n product = Product\n first = First\n last = Last\n count = Count\n count_unique = CountUnique\n avg = Average\n stdev = StandardDeviation\n median = Median\n mode = Mode\n In\u00a0[46]: Copied! t = Table({\n 'A':[1, 1, 2, 2, 3, 3] * 2,\n 'B':[1, 2, 3, 4, 5, 6] * 2,\n 'C':[6, 5, 4, 3, 2, 1] * 2,\n})\nt\n t = Table({ 'A':[1, 1, 2, 2, 3, 3] * 2, 'B':[1, 2, 3, 4, 5, 6] * 2, 'C':[6, 5, 4, 3, 2, 1] * 2, }) t Out[46]: #ABC 0116 1125 2234 3243 4352 5361 6116 7125 8234 92431035211361 In\u00a0[47]: Copied! t2 = t.pivot(rows=['C'], columns=['A'], functions=[('B', gb.sum), ('B', gb.count)], values_as_rows=False)\nt2\n t2 = t.pivot(rows=['C'], columns=['A'], functions=[('B', gb.sum), ('B', gb.count)], values_as_rows=False) t2 pivot: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 14/14 [00:00<00:00, 3643.83it/s]\n Out[47]: #CSum(B,A=1)Count(B,A=1)Sum(B,A=2)Count(B,A=2)Sum(B,A=3)Count(B,A=3) 0622NoneNoneNoneNone 1542NoneNoneNoneNone 24NoneNone62NoneNone 33NoneNone82NoneNone 42NoneNoneNoneNone102 51NoneNoneNoneNone122 In\u00a0[48]: Copied! numbers = Table()\nnumbers.add_column('number', data=[ 1, 2, 3, 4, None])\nnumbers.add_column('colour', data=['black', 'blue', 'white', 'white', 'blue'])\n\nletters = Table()\nletters.add_column('letter', data=[ 'a', 'b', 'c', 'd', None])\nletters.add_column('color', data=['blue', 'white', 'orange', 'white', 'blue'])\n numbers = Table() numbers.add_column('number', data=[ 1, 2, 3, 4, None]) numbers.add_column('colour', data=['black', 'blue', 'white', 'white', 'blue']) letters = Table() letters.add_column('letter', data=[ 'a', 'b', 'c', 'd', None]) letters.add_column('color', data=['blue', 'white', 'orange', 'white', 'blue']) In\u00a0[49]: Copied! 
## left join\n## SELECT number, letter FROM numbers LEFT JOIN letters ON numbers.colour == letters.color\nleft_join = numbers.left_join(letters, left_keys=['colour'], right_keys=['color'], left_columns=['number'], right_columns=['letter'])\nleft_join\n ## left join ## SELECT number, letter FROM numbers LEFT JOIN letters ON numbers.colour == letters.color left_join = numbers.left_join(letters, left_keys=['colour'], right_keys=['color'], left_columns=['number'], right_columns=['letter']) left_join join: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 2/2 [00:00<00:00, 1221.94it/s]\n Out[49]: #numberletter 01None 12a 22None 3Nonea 4NoneNone 53b 63d 74b 84d In\u00a0[50]: Copied! ## inner join\n## SELECT number, letter FROM numbers JOIN letters ON numbers.colour == letters.color\ninner_join = numbers.inner_join(letters, left_keys=['colour'], right_keys=['color'], left_columns=['number'], right_columns=['letter'])\ninner_join\n ## inner join ## SELECT number, letter FROM numbers JOIN letters ON numbers.colour == letters.color inner_join = numbers.inner_join(letters, left_keys=['colour'], right_keys=['color'], left_columns=['number'], right_columns=['letter']) inner_join join: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 2/2 [00:00<00:00, 1121.77it/s]\n Out[50]: #numberletter 02a 12None 2Nonea 3NoneNone 43b 53d 64b 74d In\u00a0[51]: Copied! # outer join\n## SELECT number, letter FROM numbers OUTER JOIN letters ON numbers.colour == letters.color\nouter_join = numbers.outer_join(letters, left_keys=['colour'], right_keys=['color'], left_columns=['number'], right_columns=['letter'])\nouter_join\n # outer join ## SELECT number, letter FROM numbers OUTER JOIN letters ON numbers.colour == letters.color outer_join = numbers.outer_join(letters, left_keys=['colour'], right_keys=['color'], left_columns=['number'], right_columns=['letter']) outer_join join: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 2/2 [00:00<00:00, 1585.15it/s]\n Out[51]: #numberletter 01None 12a 22None 3Nonea 4NoneNone 53b 63d 74b 84d 9Nonec Q: But ...I think there's a bug in the join... A: Venn diagrams do not explain joins. A Venn diagram is a widely-used diagram style that shows the logical relation between sets, popularised by John Venn in the 1880s. The diagrams are used to teach elementary set theory, and to illustrate simple set relationshipssource: en.wikipedia.org Joins operate over rows and when there are duplicate rows, these will be replicated in the output. Many beginners are surprised by this, because they didn't read the SQL standard. Q: So what do I do? A: If you want to get rid of duplicates using tablite, use the index functionality across all columns and pick the first row from each index. Here's the recipe that starts with plenty of duplicates: In\u00a0[52]: Copied! old_table = Table({\n'A':[1,1,1,2,2,2,3,3,3],\n'B':[1,1,4,2,2,5,3,3,6],\n})\nold_table\n old_table = Table({ 'A':[1,1,1,2,2,2,3,3,3], 'B':[1,1,4,2,2,5,3,3,6], }) old_table Out[52]: #AB 011 111 214 322 422 525 633 733 836 In\u00a0[53]: Copied! ## CREATE TABLE OF UNIQUE ENTRIES (a.k.a. DEDUPLICATE)\nnew_table = old_table.drop_duplicates()\nnew_table\n ## CREATE TABLE OF UNIQUE ENTRIES (a.k.a. DEDUPLICATE) new_table = old_table.drop_duplicates() new_table 9it [00:00, 11329.15it/s]\njoin: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 2/2 [00:00<00:00, 1819.26it/s]\n Out[53]: #AB 011 114 222 325 433 536 You can also use groupby; We'll get to that in a minute. 
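Here's a minimal sketch of the index-based recipe mentioned above, in case you want to do it by hand: build the index across all columns and keep only the first row number of every key. (The names keepers and deduped are illustrative, not part of the API.)

index = old_table.index('A', 'B')                # index across all columns
keepers = {rows[0] for rows in index.values()}   # first row number per unique key

deduped = Table({'A': [], 'B': []})              # start with empty columns
for ix, row in enumerate(old_table.rows):
    if ix in keepers:
        deduped.add_rows(row)                    # add_rows accepts a list of values
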
Lookup is a special case of a search loop: Say for example you are planning a concert and want to make sure that your friends can make it home using public transport: You would have to find the first departure after the concert ends towards their home. A join would only give you a direct match on the time. Lookup allows you \"to iterate through a list of data and find the first match given a set of criteria.\" Here's an example: First we have our list of friends and their stops. In\u00a0[54]: Copied! friends = Table({\n\"name\":['Alice', 'Betty', 'Charlie', 'Dorethy', 'Edward', 'Fred'],\n\"stop\":['Downtown-1', 'Downtown-2', 'Hillside View', 'Hillside Crescent', 'Downtown-2', 'Chicago'],\n})\nfriends\n friends = Table({ \"name\":['Alice', 'Betty', 'Charlie', 'Dorethy', 'Edward', 'Fred'], \"stop\":['Downtown-1', 'Downtown-2', 'Hillside View', 'Hillside Crescent', 'Downtown-2', 'Chicago'], }) friends Out[54]: #namestop 0AliceDowntown-1 1BettyDowntown-2 2CharlieHillside View 3DorethyHillside Crescent 4EdwardDowntown-2 5FredChicago Next we need a list of bus routes and their time and stops. I don't have that, so I'm making one up: In\u00a0[55]: Copied! import random\nrandom.seed(11)\ntable_size = 40\n\ntimes = [DataTypes.time(random.randint(21, 23), random.randint(0, 59)) for i in range(table_size)]\nstops = ['Stadium', 'Hillside', 'Hillside View', 'Hillside Crescent', 'Downtown-1', 'Downtown-2',\n 'Central station'] * 2 + [f'Random Road-{i}' for i in range(table_size)]\nroute = [random.choice([1, 2, 3]) for i in stops]\n import random random.seed(11) table_size = 40 times = [DataTypes.time(random.randint(21, 23), random.randint(0, 59)) for i in range(table_size)] stops = ['Stadium', 'Hillside', 'Hillside View', 'Hillside Crescent', 'Downtown-1', 'Downtown-2', 'Central station'] * 2 + [f'Random Road-{i}' for i in range(table_size)] route = [random.choice([1, 2, 3]) for i in stops] In\u00a0[56]: Copied! bus_table = Table({\n\"time\":times,\n\"stop\":stops[:table_size],\n\"route\":route[:table_size],\n})\nbus_table.sort(mapping={'time': False})\n\nprint(\"Departures from Concert Hall towards ...\")\nbus_table[0:10]\n bus_table = Table({ \"time\":times, \"stop\":stops[:table_size], \"route\":route[:table_size], }) bus_table.sort(mapping={'time': False}) print(\"Departures from Concert Hall towards ...\") bus_table[0:10] creating sort index: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 1/1 [00:00<00:00, 1459.90it/s]\njoin: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 3/3 [00:00<00:00, 2421.65it/s]\n Departures from Concert Hall towards ...\n Out[56]: #timestoproute 021:02:00Random Road-62 121:05:00Hillside Crescent2 221:06:00Hillside1 321:25:00Random Road-241 421:29:00Random Road-161 521:32:00Random Road-211 621:33:00Random Road-121 721:36:00Random Road-233 821:38:00Central station2 921:38:00Random Road-82 Let's say the concerts ends at 21:00 and it takes a 10 minutes to get to the bus-stop. Earliest departure must then be 21:10 - goodbye hugs included. In\u00a0[57]: Copied! 
lookup_1 = friends.lookup(bus_table, (DataTypes.time(21, 10), \"<=\", 'time'), ('stop', \"==\", 'stop'))\nlookup1_sorted = lookup_1.sorted(mapping={'time': False, 'name':False}, sort_mode='unix')\nlookup1_sorted\n lookup_1 = friends.lookup(bus_table, (DataTypes.time(21, 10), \"<=\", 'time'), ('stop', \"==\", 'stop')) lookup1_sorted = lookup_1.sorted(mapping={'time': False, 'name':False}, sort_mode='unix') lookup1_sorted 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 6/6 [00:00<00:00, 1513.92it/s]\njoin: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 3/3 [00:00<00:00, 2003.65it/s]\ncreating sort index: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 2/2 [00:00<00:00, 2589.88it/s]\njoin: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 5/5 [00:00<00:00, 2034.29it/s]\n Out[57]: #namestoptimestop_1route 0FredChicagoNoneNoneNone 1BettyDowntown-221:51:00Downtown-21 2EdwardDowntown-221:51:00Downtown-21 3CharlieHillside View22:19:00Hillside View2 4AliceDowntown-123:12:00Downtown-13 5DorethyHillside Crescent23:54:00Hillside Crescent1 Lookup's ability to use custom criteria thereby makes it far more versatile than SQL joins. But with great power comes great responsibility. In\u00a0[58]: Copied! materials = Table({\n 'bom_id': [1, 2, 3, 4, 5, 6, 7, 8, 9], \n 'partial_of': [1, 2, 3, 4, 5, 6, 7, 4, 6], \n 'sku': ['A', 'irrelevant', 'empty carton', 'pkd carton', 'empty pallet', 'pkd pallet', 'pkd irrelevant', 'ppkd carton', 'ppkd pallet'], \n 'material_id': [None, None, None, 3, None, 5, 3, 3, 5], \n 'quantity': [10, 20, 30, 40, 50, 60, 70, 80, 90]\n})\n # 9 is a partially packed pallet of 6\n\n## multiple values.\nlooking_for = Table({\n 'bom_id': [3,4,6], \n 'moq': [1,2,3]\n })\n materials = Table({ 'bom_id': [1, 2, 3, 4, 5, 6, 7, 8, 9], 'partial_of': [1, 2, 3, 4, 5, 6, 7, 4, 6], 'sku': ['A', 'irrelevant', 'empty carton', 'pkd carton', 'empty pallet', 'pkd pallet', 'pkd irrelevant', 'ppkd carton', 'ppkd pallet'], 'material_id': [None, None, None, 3, None, 5, 3, 3, 5], 'quantity': [10, 20, 30, 40, 50, 60, 70, 80, 90] }) # 9 is a partially packed pallet of 6 ## multiple values. looking_for = Table({ 'bom_id': [3,4,6], 'moq': [1,2,3] }) Our goal is now to find the quantity from the materials table based on the items in the looking_for table. This requires two steps: - lookup
- filter with
all by dropping items that didn't match. In\u00a0[59]: Copied! ## step 1/2:\nproducts_lookup = materials.lookup(looking_for, (\"bom_id\", \"==\", \"bom_id\"), (\"partial_of\", \"==\", \"bom_id\"), all=False) \nproducts_lookup\n ## step 1/2: products_lookup = materials.lookup(looking_for, (\"bom_id\", \"==\", \"bom_id\"), (\"partial_of\", \"==\", \"bom_id\"), all=False) products_lookup 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 9/9 [00:00<00:00, 3651.81it/s]\njoin: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 2/2 [00:00<00:00, 1625.38it/s]\n Out[59]: #bom_idpartial_ofskumaterial_idquantitybom_id_1moq 011ANone10NoneNone 122irrelevantNone20NoneNone 233empty cartonNone3031 344pkd carton34042 455empty palletNone50NoneNone 566pkd pallet56063 677pkd irrelevant370NoneNone 784ppkd carton38042 896ppkd pallet59063 In\u00a0[60]: Copied! ## step 2/2:\nproducts = products_lookup.all(bom_id_1=lambda x: x is not None)\nproducts\n ## step 2/2: products = products_lookup.all(bom_id_1=lambda x: x is not None) products Out[60]: #bom_idpartial_ofskumaterial_idquantitybom_id_1moq 033empty cartonNone3031 144pkd carton34042 266pkd pallet56063 384ppkd carton38042 496ppkd pallet59063 The faster way to solve this problem is to use match ! Here is the example: In\u00a0[61]: Copied! products_matched = materials.match(looking_for, (\"bom_id\", \"==\", \"bom_id\"), (\"partial_of\", \"==\", \"bom_id\"))\nproducts_matched\n products_matched = materials.match(looking_for, (\"bom_id\", \"==\", \"bom_id\"), (\"partial_of\", \"==\", \"bom_id\")) products_matched Out[61]: #bom_idpartial_ofskumaterial_idquantitybom_id_1moq 033empty cartonNone3031 144pkd carton34042 266pkd pallet56063 384ppkd carton38042 496ppkd pallet59063 In\u00a0[62]: Copied! assert products == products_matched\n assert products == products_matched In\u00a0[63]: Copied! from tablite import Table\nt = Table() # create table\nt.add_columns('row','A','B','C') # add columns\n from tablite import Table t = Table() # create table t.add_columns('row','A','B','C') # add columns The following examples are all valid and append the row (1,2,3) to the table. In\u00a0[64]: Copied! t.add_rows(1, 1, 2, 3) # individual values\nt.add_rows([2, 1, 2, 3]) # list of values\nt.add_rows((3, 1, 2, 3)) # tuple of values\nt.add_rows(*(4, 1, 2, 3)) # unpacked tuple\nt.add_rows(row=5, A=1, B=2, C=3) # keyword - args\nt.add_rows(**{'row': 6, 'A': 1, 'B': 2, 'C': 3}) # dict / json.\n t.add_rows(1, 1, 2, 3) # individual values t.add_rows([2, 1, 2, 3]) # list of values t.add_rows((3, 1, 2, 3)) # tuple of values t.add_rows(*(4, 1, 2, 3)) # unpacked tuple t.add_rows(row=5, A=1, B=2, C=3) # keyword - args t.add_rows(**{'row': 6, 'A': 1, 'B': 2, 'C': 3}) # dict / json. The following examples add two rows to the table In\u00a0[65]: Copied! t.add_rows((7, 1, 2, 3), (8, 4, 5, 6)) # two (or more) tuples.\nt.add_rows([9, 1, 2, 3], [10, 4, 5, 6]) # two or more lists\nt.add_rows({'row': 11, 'A': 1, 'B': 2, 'C': 3},\n {'row': 12, 'A': 4, 'B': 5, 'C': 6}) # two (or more) dicts as args.\nt.add_rows(*[{'row': 13, 'A': 1, 'B': 2, 'C': 3},\n {'row': 14, 'A': 1, 'B': 2, 'C': 3}]) # list of dicts.\n t.add_rows((7, 1, 2, 3), (8, 4, 5, 6)) # two (or more) tuples. t.add_rows([9, 1, 2, 3], [10, 4, 5, 6]) # two or more lists t.add_rows({'row': 11, 'A': 1, 'B': 2, 'C': 3}, {'row': 12, 'A': 4, 'B': 5, 'C': 6}) # two (or more) dicts as args. t.add_rows(*[{'row': 13, 'A': 1, 'B': 2, 'C': 3}, {'row': 14, 'A': 1, 'B': 2, 'C': 3}]) # list of dicts. In\u00a0[66]: Copied! 
t\n t Out[66]: #rowABC 01123 12123 23123 34123 45123 56123 67123 78456 89123 9104561011123111245612131231314123 As the row incremented from 1 in the first of these examples, and finished with row: 14 , you can now see the whole table above In\u00a0[67]: Copied! from pathlib import Path\npath = Path('tests/data/book1.csv')\ntx = Table.from_file(path)\ntx\n from pathlib import Path path = Path('tests/data/book1.csv') tx = Table.from_file(path) tx Collecting tasks: 'tests/data/book1.csv'\nDumping tasks: 'tests/data/book1.csv'\n importing file: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 1/1 [00:00<00:00, 444.08it/s]\n Out[67]: #abcdef 010.0606060610.0909090910.1212121210.1515151520.181818182 120.1212121210.2424242420.4848484850.969696971.939393939 230.2424242420.4848484850.969696971.9393939393.878787879 340.4848484850.969696971.9393939393.8787878797.757575758 450.969696971.9393939393.8787878797.75757575815.51515152 561.9393939393.8787878797.75757575815.5151515231.03030303 673.8787878797.75757575815.5151515231.0303030362.06060606.....................383916659267088.033318534175.066637068350.0133274000000.0266548000000.0394033318534175.066637068350.0133274000000.0266548000000.0533097000000.0404166637068350.0133274000000.0266548000000.0533097000000.01066190000000.04142133274000000.0266548000000.0533097000000.01066190000000.02132390000000.04243266548000000.0533097000000.01066190000000.02132390000000.04264770000000.04344533097000000.01066190000000.02132390000000.04264770000000.08529540000000.044451066190000000.02132390000000.04264770000000.08529540000000.017059100000000.0 Note that you can also add start, limit and chunk_size to the file reader. Here's an example: In\u00a0[68]: Copied! path = Path('tests/data/book1.csv')\ntx2 = Table.from_file(path, start=2, limit=15)\ntx2\n path = Path('tests/data/book1.csv') tx2 = Table.from_file(path, start=2, limit=15) tx2 Collecting tasks: 'tests/data/book1.csv'\n importing file: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 1/1 [00:00<00:00, 391.22it/s] Dumping tasks: 'tests/data/book1.csv'\n \n Out[68]: #abcdef 030.2424242420.4848484850.969696971.9393939393.878787879 140.4848484850.969696971.9393939393.8787878797.757575758 250.969696971.9393939393.8787878797.75757575815.51515152 361.9393939393.8787878797.75757575815.5151515231.03030303 473.8787878797.75757575815.5151515231.0303030362.06060606 587.75757575815.5151515231.0303030362.06060606124.1212121 6915.5151515231.0303030362.06060606124.1212121248.2424242 71031.0303030362.06060606124.1212121248.2424242496.4848485 81162.06060606124.1212121248.2424242496.4848485992.969697 912124.1212121248.2424242496.4848485992.9696971985.9393941013248.2424242496.4848485992.9696971985.9393943971.8787881114496.4848485992.9696971985.9393943971.8787887943.7575761215992.9696971985.9393943971.8787887943.75757615887.5151513161985.9393943971.8787887943.75757615887.5151531775.030314173971.8787887943.75757615887.5151531775.030363550.06061 How good is the file_reader? I've included all formats in the test suite that are publicly available from the Alan Turing institute, dateutils) and Python's csv reader. What about MM-DD-YYYY formats? Some users from the US ask why the csv reader doesn't read the month-day-year format. The answer is simple: It's not an iso8601 format. The US month-day-year format is a locale that may be used a lot in the US, but it isn't an international standard. If you need to work with MM-DD-YYYY you will find that the file_reader will import the values as text (str). 
You can then reformat it with a custom function like: In\u00a0[69]: Copied! s = \"03-21-1998\"\nfrom datetime import date\nf = lambda s: date(int(s[-4:]), int(s[:2]), int(s[3:5]))\nf(s)\n s = \"03-21-1998\" from datetime import date f = lambda s: date(int(s[-4:]), int(s[:2]), int(s[3:5])) f(s) Out[69]: datetime.date(1998, 3, 21) In\u00a0[70]: Copied! from tablite.import_utils import file_readers\nfor k,v in file_readers.items():\n print(k,v)\n from tablite.import_utils import file_readers for k,v in file_readers.items(): print(k,v) fods <function excel_reader at 0x7f36a3ef8c10>\njson <function excel_reader at 0x7f36a3ef8c10>\nhtml <function from_html at 0x7f36a3ef8b80>\nhdf5 <function from_hdf5 at 0x7f36a3ef8a60>\nsimple <function excel_reader at 0x7f36a3ef8c10>\nrst <function excel_reader at 0x7f36a3ef8c10>\nmediawiki <function excel_reader at 0x7f36a3ef8c10>\nxlsx <function excel_reader at 0x7f36a3ef8c10>\nxls <function excel_reader at 0x7f36a3ef8c10>\nxlsm <function excel_reader at 0x7f36a3ef8c10>\ncsv <function text_reader at 0x7f36a3ef9000>\ntsv <function text_reader at 0x7f36a3ef9000>\ntxt <function text_reader at 0x7f36a3ef9000>\nods <function ods_reader at 0x7f36a3ef8ca0>\n (2) define your new file reader In\u00a0[71]: Copied! def my_magic_reader(path, **kwargs): # define your new file reader.\n print(\"do magic with {path}\")\n return\n def my_magic_reader(path, **kwargs): # define your new file reader. print(\"do magic with {path}\") return (3) add it to the list of readers. In\u00a0[72]: Copied! file_readers['my_special_format'] = my_magic_reader\n file_readers['my_special_format'] = my_magic_reader The file_readers are all in tablite.core so if you intend to extend the readers, I recommend that you start here. In\u00a0[73]: Copied! file = Path('example.xlsx')\ntx2.to_xlsx(file)\nos.remove(file)\n file = Path('example.xlsx') tx2.to_xlsx(file) os.remove(file) In\u00a0[74]: Copied! from tablite import Table\n\nt = Table({\n'a':[1, 2, 8, 3, 4, 6, 5, 7, 9],\n'b':[10, 100, 3, 4, 16, -1, 10, 10, 10],\n})\nt.sort(mapping={\"a\":False})\nt\n from tablite import Table t = Table({ 'a':[1, 2, 8, 3, 4, 6, 5, 7, 9], 'b':[10, 100, 3, 4, 16, -1, 10, 10, 10], }) t.sort(mapping={\"a\":False}) t creating sort index: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 1/1 [00:00<00:00, 1674.37it/s]\njoin: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 2/2 [00:00<00:00, 1701.89it/s]\n Out[74]: #ab 0110 12100 234 3416 4510 56-1 6710 783 8910 In\u00a0[75]: Copied! %pip install matplotlib -q\n %pip install matplotlib -q Note: you may need to restart the kernel to use updated packages.\n In\u00a0[76]: Copied! import matplotlib.pyplot as plt\nplt.plot(t['a'], t['b'])\nplt.ylabel('Hello Figure')\nplt.show()\n import matplotlib.pyplot as plt plt.plot(t['a'], t['b']) plt.ylabel('Hello Figure') plt.show() In\u00a0[77]: Copied! ## Let's monitor the memory and record the observations into a table!\nimport psutil, os, gc\nfrom time import process_time,sleep\nprocess = psutil.Process(os.getpid())\n\ndef mem_time(): # go and check taskmanagers memory usage.\n return process.memory_info().rss, process_time()\n\ndigits = 1_000_000\n\nrecords = Table({'method':[], 'memory':[], 'time':[]})\n ## Let's monitor the memory and record the observations into a table! import psutil, os, gc from time import process_time,sleep process = psutil.Process(os.getpid()) def mem_time(): # go and check taskmanagers memory usage. 
return process.memory_info().rss, process_time() digits = 1_000_000 records = Table({'method':[], 'memory':[], 'time':[]}) The row based format: 1 million 10-tuples In\u00a0[78]: Copied! before, start = mem_time()\nL = [tuple([11 for _ in range(10)]) for _ in range(digits)]\nafter, end = mem_time() \ndel L\ngc.collect()\n\nrecords.add_rows(*('1e6 lists w. 10 integers', after - before, round(end-start,4)))\nrecords\n before, start = mem_time() L = [tuple([11 for _ in range(10)]) for _ in range(digits)] after, end = mem_time() del L gc.collect() records.add_rows(*('1e6 lists w. 10 integers', after - before, round(end-start,4))) records Out[78]: #methodmemorytime 01e6 lists w. 10 integers1190543360.5045 The column based format: 10 columns with 1M values: In\u00a0[79]: Copied! before, start = mem_time()\nL = [[11 for i2 in range(digits)] for i1 in range(10)]\nafter,end = mem_time()\n\ndel L\ngc.collect()\nrecords.add_rows(('10 lists with 1e6 integers', after - before, round(end-start,4)))\n before, start = mem_time() L = [[11 for i2 in range(digits)] for i1 in range(10)] after,end = mem_time() del L gc.collect() records.add_rows(('10 lists with 1e6 integers', after - before, round(end-start,4))) We've thereby saved 50 Mb by avoiding the overhead from managing 1 million lists. Q: But why didn't I just use an array? It would have even lower memory footprint. A: First, array's don't handle None's and we get that frequently in dirty csv data. Second, Table needs even less memory. Let's try with an array: In\u00a0[80]: Copied! import array\n\nbefore, start = mem_time()\nL = [array.array('i', [11 for _ in range(digits)]) for _ in range(10)]\nafter,end = mem_time()\n\ndel L\ngc.collect()\nrecords.add_rows(('10 lists with 1e6 integers in arrays', after - before, round(end-start,4)))\nrecords\n import array before, start = mem_time() L = [array.array('i', [11 for _ in range(digits)]) for _ in range(10)] after,end = mem_time() del L gc.collect() records.add_rows(('10 lists with 1e6 integers in arrays', after - before, round(end-start,4))) records Out[80]: #methodmemorytime 01e6 lists w. 10 integers1190543360.5045 110 lists with 1e6 integers752762880.1906 210 lists with 1e6 integers in arrays398336000.3633 Finally let's use a tablite.Table : In\u00a0[81]: Copied! before,start = mem_time()\nt = Table(columns={str(i1): [11 for i2 in range(digits)] for i1 in range(10)})\nafter,end = mem_time()\n\nrecords.add_rows(('Table with 10 columns with 1e6 integers', after - before, round(end-start,4)))\n\nbefore,start = mem_time()\nt2 = t.copy()\nafter,end = mem_time()\n\nrecords.add_rows(('2 Tables with 10 columns with 1e6 integers each', after - before, round(end-start,4)))\n\n## Let's show it, so we know nobody's cheating:\nt2\n before,start = mem_time() t = Table(columns={str(i1): [11 for i2 in range(digits)] for i1 in range(10)}) after,end = mem_time() records.add_rows(('Table with 10 columns with 1e6 integers', after - before, round(end-start,4))) before,start = mem_time() t2 = t.copy() after,end = mem_time() records.add_rows(('2 Tables with 10 columns with 1e6 integers each', after - before, round(end-start,4))) ## Let's show it, so we know nobody's cheating: t2 Out[81]: #0123456789 011111111111111111111 111111111111111111111 211111111111111111111 311111111111111111111 411111111111111111111 511111111111111111111 611111111111111111111................................. 
999,99311111111111111111111 999,99411111111111111111111 999,99511111111111111111111 999,99611111111111111111111 999,99711111111111111111111 999,99811111111111111111111 999,99911111111111111111111 In\u00a0[82]: Copied! records\n records Out[82]: #methodmemorytime 01e6 lists w. 10 integers1190543360.5045 110 lists with 1e6 integers752762880.1906 210 lists with 1e6 integers in arrays398336000.3633 3Table with 10 columns with 1e6 integers01.9569 42 Tables with 10 columns with 1e6 integers each00.0001 Conclusion: whilst the common worst case (1M lists with 10 integers) take up 118 Mb of RAM, Tablite's tables vanish in the noise of memory measurement. Pandas also permits the usage of namedtuples, which are unpacked upon entry. from collections import namedtuple\nPoint = namedtuple(\"Point\", \"x y\")\npoints = [Point(0, 0), Point(0, 3)]\npd.DataFrame(points)\n Doing that in tablite is a bit different. To unpack the named tuple, you should do so explicitly: t = Table({'x': [p.x for p in points], 'y': [p.y for p in points]})\n However should you want to keep the points as namedtuple, you can do so in tablite: t = Table()\nt['points'] = points\n Tablite will store a serialised version of the points, so your memory overhead will be close to zero. "},{"location":"tutorial/#tablite","title":"Tablite\u00b6","text":""},{"location":"tutorial/#introduction","title":"Introduction\u00b6","text":"Tablite fills the data-science space where incremental data processing based on: - Datasets are larger than memory.
- You don't want to worry about datatypes.
Tablite thereby competes with: - Pandas, but without the memory overhead.
- Numpy, but spares you from worrying about lower level data types
- SQLite, by sheer speed.
- Polars, by working beyond RAM.
- Other libraries for data cleaning thanks to tablite's powerful
datatypes module. Install: pip install tablite Usage: >>> from tablite import Table Upgrade: pip install tablite --no-cache --upgrade "},{"location":"tutorial/#overview","title":"Overview\u00b6","text":"(Version 2023.6.0 and later. For older version see this) - Tablite handles all Python datatypes:
str , float , bool , int , date , datetime , time , timedelta and None . - you can select:
- all rows in a column as
table['A'] - rows across all columns as
table[4:8] - or a slice as
table['A', 'B', slice(4,8) ] . - you can update values with
table['A'][2] = new value - you can store or send data using json, by:
- dumping to json:
json_str = table.to_json() , or - you can load it with
Table.from_json(json_str) . - you can iterate over rows using
for row in Table.rows . - you can ask
column_xyz in Table.columns ? - load from files with
new_table = Table.from_file('this.csv') which has automatic datatype detection - perform inner, outer & left SQL joins between tables as simply as
table_1.inner_join(table2, keys=['A', 'B']) - summarise using
table.groupby( ... ) - create pivot tables using
groupby.pivot( ... ) - perform multi-criteria lookup in tables using
table1.lookup(table2, criteria=..... - and of course a large selection of tools in
from tablite.tools import * "},{"location":"tutorial/#examples","title":"Examples\u00b6","text":"Here are some examples: "},{"location":"tutorial/#api-examples","title":"API Examples\u00b6","text":"In the following sections, example are given of the Tablite API's power features: - Iteration
- Append
- Sort
- Filter
- Index
- Search All
- Search Any
- Lookup
- Join (inner, outer & left)
- GroupBy
- Pivot table
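Before diving into each feature, here's a minimal warm-up that strings a few of them together. It is a sketch only, using calls demonstrated in the sections below, and the table content is made up for illustration:

from tablite import Table

t = Table({'A': [1, 2, 3, 4], 'B': [10, 20, 30, 40]})
t += t                                            # APPEND: doubles the rows
t.sort(mapping={'A': False})                      # SORT: ascending on 'A'
true, false = t.filter(                           # FILTER: split into matches and the rest
    [{'column1': 'A', 'criteria': '>=', 'value2': 3}], filter_type='all'
)
index = t.index('A', 'B')                         # INDEX: row numbers per unique key pair
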
"},{"location":"tutorial/#iteration","title":"ITERATION!\u00b6","text":"Iteration supports for loops and list comprehension at the speed of light: Just use [r for r in table.rows] , or: for row in table.rows:\n row ... Here's a more practical use case: (1) Imagine a table with columns a,b,c,d,e (all integers) like this: "},{"location":"tutorial/#create-index-indices","title":"Create Index / Indices\u00b6","text":"Index supports multi-key indexing using args such as: index = table.index('B','C') . Here's an example: "},{"location":"tutorial/#append","title":"APPEND\u00b6","text":""},{"location":"tutorial/#save","title":"SAVE\u00b6","text":""},{"location":"tutorial/#filter","title":"FILTER!\u00b6","text":""},{"location":"tutorial/#any-all","title":"Any! All?\u00b6","text":"Any and All are cousins of the filter. They're there so you can use them in the same way as you'd use any and all in python - as boolean evaluators: "},{"location":"tutorial/#sort","title":"SORT!\u00b6","text":""},{"location":"tutorial/#groupby","title":"GROUPBY !\u00b6","text":""},{"location":"tutorial/#did-i-say-pivot-table-yes","title":"Did I say pivot table? Yes.\u00b6","text":"Pivot Table is included in the groupby functionality - so yes - you can pivot the groupby on any column that is used for grouping. Here's a simple example: "},{"location":"tutorial/#join","title":"JOIN!\u00b6","text":""},{"location":"tutorial/#lookup","title":"LOOKUP!\u00b6","text":""},{"location":"tutorial/#match","title":"Match\u00b6","text":"If you're looking to do a join where you afterwards remove the empty rows, match is the faster choice. Here is an example. Let's start with two tables: "},{"location":"tutorial/#are-there-other-ways-i-can-add-data","title":"Are there other ways I can add data?\u00b6","text":"Yes - but row based operations cause a lot of IO, so it'll work but be slower: "},{"location":"tutorial/#okay-great-how-do-i-load-data","title":"Okay, great. How do I load data?\u00b6","text":"Easy. Use file_reader . Here's an example: "},{"location":"tutorial/#sweet-what-formats-are-supported-can-i-add-my-own-file-reader","title":"Sweet. What formats are supported? Can I add my own file reader?\u00b6","text":"Yes! This is very good for special log files or custom json formats. Here's how you do it: (1) Go to all existing readers in the tablite.core and find the closest match. "},{"location":"tutorial/#very-nice-how-about-exporting-data","title":"Very nice. How about exporting data?\u00b6","text":"Just use .export "},{"location":"tutorial/#cool-does-it-play-well-with-plotting-packages","title":"Cool. Does it play well with plotting packages?\u00b6","text":"Yes. Here's an example you can copy and paste: "},{"location":"tutorial/#i-like-sql-can-tablite-understand-sql","title":"I like sql. Can tablite understand SQL?\u00b6","text":"Almost. You can use table.to_sql and tablite will return ANSI-92 compliant SQL. You can also create a table using Table.from_sql and tablite will consume ANSI-92 compliant SQL. "},{"location":"tutorial/#but-what-do-i-do-if-im-about-to-run-out-of-memory","title":"But what do I do if I'm about to run out of memory?\u00b6","text":"You wont. Every tablite table is backed by disk. The memory footprint of a table is only the metadata required to know the relationships between variable names and the datastructures. Let's do a comparison: "},{"location":"tutorial/#conclusions","title":"Conclusions\u00b6","text":"This concludes the mega-tutorial to tablite . There's nothing more to it. 
But oh boy it'll save a lot of time. Here's a summary of features: - Everything a list can do.
- import csv*, fods, json, html, simple, rst, mediawiki, xlsx, xls, xlsm, csv, tsv, txt, ods using
Table.from_file(...) - Iterate over rows or columns
- Create multikey
index , sort , use filter , any and all to select. Perform lookup across tables including using custom functions. - Perform multikey
joins with other tables. - Perform
groupby and reorganise data as a pivot table with max, min, sum, first, last, count, unique, average, standard deviation, median and mode. - Update tables with
+= which automatically sorts out the columns - even if they're not in perfect order. "},{"location":"tutorial/#faq","title":"FAQ\u00b6","text":"Question Answer I'm not in a notebook. Is there a nice way to view tables? Yes. table.show() prints the ascii version I'm looking for the equivalent to apply in pandas. Just use list comprehensions: table[column] = [f(x) for x in table[column]] What about map ? Just use the python function: mapping = map(f, table[column name]) Is there a where function? It's called any or all like in python: table.any(column_name > 0) . I like sql and sqlite. Can I use sql? Yes. Calling table.to_sql() returns an ANSI-92 SQL compliant table definition. You can use this in any SQL compliant engine. Sometimes I need to clean up data with datetimes. Is there any tool to help with that? Yes. Look at DataTypes : DataTypes.round(value, multiple) allows rounding of datetimes. "},{"location":"tutorial/#coming-to-tablite-from-pandas","title":"Coming to Tablite from Pandas\u00b6","text":"If you're coming to Tablite from Pandas you will notice some differences. Here's the ultra short comparison to the documentation from Pandas called 10 minutes intro to pandas The tutorials provide the generic overview: - pandas tutorial
- tablite tutorial
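As a warm-up, here's a minimal sketch of the most common translation: pandas' df['A'].apply(f) becomes a plain list comprehension in tablite (the table t is made up for illustration). The comparison table below covers the rest:

from tablite import Table

t = Table({'A': [1, 2, 3]})
t['A'] = [x * 2 for x in t['A']]   # read, transform, assign back
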
Some key differences topic Tablite Viewing data Just use table.show() in printouts, or if you're in a jupyter notebook just use the variable name table Selection Slicing works both on columns and rows, and you can filter using any or all : table['A','B', 2:30:3].any(A=lambda x:x>3) to copy a table use: t2 = t.copy() This is a very fast deep copy that has no memory overhead, as tablite's memory manager keeps track of the data. Missing data Tablite uses a mixed column format for any column that isn't uniform. To get rid of rows with None s and np.nan s use: table.drop_na(None, np.nan) Alternatively you can use replace: table.replace(None,5) following the syntax: table.replace_missing_values(sources, target) Operations Descriptive statistics are on a column by column basis: table['a'].statistics() the pandas function df.apply doesn't exist in tablite. Use a list comprehension instead. For example: df.apply(np.cumsum) is just np.cumsum(t['A']) \"histogramming\" in tablite is per column: table['a'].histogram() string methods? Just use list comprehensions: table['A', 'B'].any(A=lambda x: \"hello\" in x, B=lambda x: \"world\" in x) Merge Concatenation: Just use + or += as in t1 = t2 + t3 or t1 += t4 . If the columns are out of order, tablite will sort the headers according to the order in the first table. If you're worried about header mismatches, use t1.stack(t2) Joins are ANSI92 compliant: t1.join(t2, <...args...>, join_type=...) . Grouping Tablite supports multikey groupby using from tablite import GroupBy as gb . table.groupby(keys, functions) Reshaping To reshape a table use transpose . To perform pivot-table-like operations, use: table.pivot(rows, columns, functions) Subtotals aside, tablite will give you everything Excel's pivot table can do. Time series To convert time series use a list comprehension: t1['GMT'] = [timedelta(hours=1) + v for v in t1['date'] ] to generate a date range use: from tablite import date_range t['date'] = date_range(start=2022/1/1, stop=2023/1/1, step=timedelta(days=1)) Categorical Pandas only seems to use this for sorting and grouping. Tablite table has .sort , .groupby and .pivot to achieve the same task. Plotting Import your favorite plotting package and feed it the values, such as: import matplotlib.pyplot as plt plt.plot(t['a'],t['b']) plt.show() Import/Export Tablite supports the same import/export options as pandas. Tablite pegs the free memory before IO and can therefore process larger-than-RAM files. Tablite also guesses the datatypes for all ISO formats and uses multiprocessing, and may therefore be faster. Should you want to inspect how guess works, use from tablite.tools import guess and try the function out. Gotchas None really. 
Should you come across something non-pythonic, then please post it on the issue list."},{"location":"reference/base/","title":"Base","text":""},{"location":"reference/base/#tablite.base","title":"tablite.base ","text":""},{"location":"reference/base/#tablite.base-attributes","title":"Attributes","text":""},{"location":"reference/base/#tablite.base.log","title":"tablite.base.log = logging.getLogger(__name__) module-attribute ","text":""},{"location":"reference/base/#tablite.base.file_registry","title":"tablite.base.file_registry = set() module-attribute ","text":""},{"location":"reference/base/#tablite.base-classes","title":"Classes","text":""},{"location":"reference/base/#tablite.base.SimplePage","title":"tablite.base.SimplePage(id, path, len, py_dtype) ","text":" Bases: object Source code in tablite/base.py def __init__(self, id, path, len, py_dtype) -> None:\n self.path = Path(path) / \"pages\" / f\"{id}.npy\"\n self.len = len\n self.dtype = py_dtype\n\n self._incr_refcount()\n "},{"location":"reference/base/#tablite.base.SimplePage-attributes","title":"Attributes","text":""},{"location":"reference/base/#tablite.base.SimplePage.ids","title":"tablite.base.SimplePage.ids = count(start=1) class-attribute instance-attribute ","text":""},{"location":"reference/base/#tablite.base.SimplePage.refcounts","title":"tablite.base.SimplePage.refcounts = {} class-attribute instance-attribute ","text":""},{"location":"reference/base/#tablite.base.SimplePage.autocleanup","title":"tablite.base.SimplePage.autocleanup = True class-attribute instance-attribute ","text":""},{"location":"reference/base/#tablite.base.SimplePage.path","title":"tablite.base.SimplePage.path = Path(path) / 'pages' / f'{id}.npy' instance-attribute ","text":""},{"location":"reference/base/#tablite.base.SimplePage.len","title":"tablite.base.SimplePage.len = len instance-attribute ","text":""},{"location":"reference/base/#tablite.base.SimplePage.dtype","title":"tablite.base.SimplePage.dtype = py_dtype instance-attribute ","text":""},{"location":"reference/base/#tablite.base.SimplePage-functions","title":"Functions","text":""},{"location":"reference/base/#tablite.base.SimplePage.__setstate__","title":"tablite.base.SimplePage.__setstate__(state) ","text":"when an object is unpickled, say in a case of multi-processing, object.setstate(state) is called instead of init, this means we need to update page refcount as if constructor had been called Source code in tablite/base.py def __setstate__(self, state):\n \"\"\"\n when an object is unpickled, say in a case of multi-processing,\n object.__setstate__(state) is called instead of __init__, this means\n we need to update page refcount as if constructor had been called\n \"\"\"\n self.__dict__.update(state)\n\n self._incr_refcount()\n "},{"location":"reference/base/#tablite.base.SimplePage.next_id","title":"tablite.base.SimplePage.next_id(path) classmethod ","text":"Source code in tablite/base.py @classmethod\ndef next_id(cls, path):\n path = Path(path)\n\n while True:\n _id = f\"{os.getpid()}-{next(cls.ids)}\"\n _path = path / \"pages\" / f\"{_id}.npy\"\n\n if not _path.exists():\n break # make sure we don't override existing pages if they are created outside of main thread\n\n return _id\n "},{"location":"reference/base/#tablite.base.SimplePage.__len__","title":"tablite.base.SimplePage.__len__() ","text":"Source code in tablite/base.py def __len__(self):\n return self.len\n "},{"location":"reference/base/#tablite.base.SimplePage.__repr__","title":"tablite.base.SimplePage.__repr__() -> str 
","text":"Source code in tablite/base.py def __repr__(self) -> str:\n try:\n return f\"{self.__class__.__name__}({self.path}, {self.get()})\"\n except FileNotFoundError as e:\n return f\"{self.__class__.__name__}({self.path}, <{type(e).__name__}>)\"\n except Exception as e:\n return f\"{self.__class__.__name__}({self.path}, <{e}>)\"\n "},{"location":"reference/base/#tablite.base.SimplePage.__hash__","title":"tablite.base.SimplePage.__hash__() -> int ","text":"Source code in tablite/base.py def __hash__(self) -> int:\n return hash(self.path)\n "},{"location":"reference/base/#tablite.base.SimplePage.owns","title":"tablite.base.SimplePage.owns() ","text":"Source code in tablite/base.py def owns(self):\n parts = self.path.parts\n\n return all((p in parts for p in Path(Config.pid).parts))\n "},{"location":"reference/base/#tablite.base.SimplePage.__del__","title":"tablite.base.SimplePage.__del__() ","text":"When python's reference count for an object is 0, python uses it's garbage collector to remove the object and free the memory. As tablite tables have columns and columns have page and pages have data stored on disk, the space on disk must be freed up as well. This del override assures the cleanup of stored data. Source code in tablite/base.py def __del__(self):\n \"\"\"When python's reference count for an object is 0, python uses\n it's garbage collector to remove the object and free the memory.\n As tablite tables have columns and columns have page and pages have\n data stored on disk, the space on disk must be freed up as well.\n This __del__ override assures the cleanup of stored data.\n \"\"\"\n if not self.owns():\n return\n\n refcount = self.refcounts[self.path] = max(\n self.refcounts.get(self.path, 0) - 1, 0\n )\n\n if refcount > 0:\n return\n\n if self.autocleanup:\n self.path.unlink(True)\n\n del self.refcounts[self.path]\n "},{"location":"reference/base/#tablite.base.SimplePage.get","title":"tablite.base.SimplePage.get() ","text":"loads stored data RETURNS DESCRIPTION np.ndarray: stored data. Source code in tablite/base.py def get(self):\n \"\"\"loads stored data\n\n Returns:\n np.ndarray: stored data.\n \"\"\"\n array = load_numpy(self.path)\n return MetaArray(array, array.dtype, py_dtype=self.dtype)\n "},{"location":"reference/base/#tablite.base.Page","title":"tablite.base.Page(path, array) ","text":" Bases: SimplePage PARAMETER DESCRIPTION path working directory. 
TYPE: Path array data TYPE: array Source code in tablite/base.py def __init__(self, path, array) -> None:\n \"\"\"\n Args:\n path (Path): working directory.\n array (np.array): data\n \"\"\"\n _id = self.next_id(path)\n\n type_check(array, np.ndarray)\n\n if Config.DISK_LIMIT <= 0:\n pass\n else:\n _, _, free = shutil.disk_usage(path)\n if free - array.nbytes < Config.DISK_LIMIT:\n msg = \"\\n\".join(\n [\n f\"Disk limit reached: Config.DISK_LIMIT = {Config.DISK_LIMIT:,} bytes.\",\n f\"array requires {array.nbytes:,} bytes, but only {free:,} bytes are free.\",\n \"To disable this check, use:\",\n \">>> from tablite.config import Config\",\n \">>> Config.DISK_LIMIT = 0\",\n \"To free space, clean up Config.workdir:\",\n f\"{Config.workdir}\",\n ]\n )\n raise OSError(msg)\n\n _len = len(array)\n # type_check(array, MetaArray)\n if not hasattr(array, \"metadata\"):\n raise ValueError\n _dtype = array.metadata[\"py_dtype\"]\n\n super().__init__(_id, path, _len, _dtype)\n\n np.save(self.path, array, allow_pickle=True, fix_imports=False)\n log.debug(f\"Page saved: {self.path}\")\n "},{"location":"reference/base/#tablite.base.Page-attributes","title":"Attributes","text":""},{"location":"reference/base/#tablite.base.Page.ids","title":"tablite.base.Page.ids = count(start=1) class-attribute instance-attribute ","text":""},{"location":"reference/base/#tablite.base.Page.refcounts","title":"tablite.base.Page.refcounts = {} class-attribute instance-attribute ","text":""},{"location":"reference/base/#tablite.base.Page.autocleanup","title":"tablite.base.Page.autocleanup = True class-attribute instance-attribute ","text":""},{"location":"reference/base/#tablite.base.Page.path","title":"tablite.base.Page.path = Path(path) / 'pages' / f'{id}.npy' instance-attribute ","text":""},{"location":"reference/base/#tablite.base.Page.len","title":"tablite.base.Page.len = len instance-attribute ","text":""},{"location":"reference/base/#tablite.base.Page.dtype","title":"tablite.base.Page.dtype = py_dtype instance-attribute ","text":""},{"location":"reference/base/#tablite.base.Page-functions","title":"Functions","text":""},{"location":"reference/base/#tablite.base.Page.__setstate__","title":"tablite.base.Page.__setstate__(state) ","text":"when an object is unpickled, say in a case of multi-processing, object.setstate(state) is called instead of init, this means we need to update page refcount as if constructor had been called Source code in tablite/base.py def __setstate__(self, state):\n \"\"\"\n when an object is unpickled, say in a case of multi-processing,\n object.__setstate__(state) is called instead of __init__, this means\n we need to update page refcount as if constructor had been called\n \"\"\"\n self.__dict__.update(state)\n\n self._incr_refcount()\n "},{"location":"reference/base/#tablite.base.Page.next_id","title":"tablite.base.Page.next_id(path) classmethod ","text":"Source code in tablite/base.py @classmethod\ndef next_id(cls, path):\n path = Path(path)\n\n while True:\n _id = f\"{os.getpid()}-{next(cls.ids)}\"\n _path = path / \"pages\" / f\"{_id}.npy\"\n\n if not _path.exists():\n break # make sure we don't override existing pages if they are created outside of main thread\n\n return _id\n "},{"location":"reference/base/#tablite.base.Page.__len__","title":"tablite.base.Page.__len__() ","text":"Source code in tablite/base.py def __len__(self):\n return self.len\n "},{"location":"reference/base/#tablite.base.Page.__repr__","title":"tablite.base.Page.__repr__() -> str ","text":"Source code in 
tablite/base.py def __repr__(self) -> str:\n try:\n return f\"{self.__class__.__name__}({self.path}, {self.get()})\"\n except FileNotFoundError as e:\n return f\"{self.__class__.__name__}({self.path}, <{type(e).__name__}>)\"\n except Exception as e:\n return f\"{self.__class__.__name__}({self.path}, <{e}>)\"\n "},{"location":"reference/base/#tablite.base.Page.__hash__","title":"tablite.base.Page.__hash__() -> int ","text":"Source code in tablite/base.py def __hash__(self) -> int:\n return hash(self.path)\n "},{"location":"reference/base/#tablite.base.Page.owns","title":"tablite.base.Page.owns() ","text":"Source code in tablite/base.py def owns(self):\n parts = self.path.parts\n\n return all((p in parts for p in Path(Config.pid).parts))\n "},{"location":"reference/base/#tablite.base.Page.__del__","title":"tablite.base.Page.__del__() ","text":"When python's reference count for an object is 0, python uses it's garbage collector to remove the object and free the memory. As tablite tables have columns and columns have page and pages have data stored on disk, the space on disk must be freed up as well. This del override assures the cleanup of stored data. Source code in tablite/base.py def __del__(self):\n \"\"\"When python's reference count for an object is 0, python uses\n it's garbage collector to remove the object and free the memory.\n As tablite tables have columns and columns have page and pages have\n data stored on disk, the space on disk must be freed up as well.\n This __del__ override assures the cleanup of stored data.\n \"\"\"\n if not self.owns():\n return\n\n refcount = self.refcounts[self.path] = max(\n self.refcounts.get(self.path, 0) - 1, 0\n )\n\n if refcount > 0:\n return\n\n if self.autocleanup:\n self.path.unlink(True)\n\n del self.refcounts[self.path]\n "},{"location":"reference/base/#tablite.base.Page.get","title":"tablite.base.Page.get() ","text":"loads stored data RETURNS DESCRIPTION np.ndarray: stored data. Source code in tablite/base.py def get(self):\n \"\"\"loads stored data\n\n Returns:\n np.ndarray: stored data.\n \"\"\"\n array = load_numpy(self.path)\n return MetaArray(array, array.dtype, py_dtype=self.dtype)\n "},{"location":"reference/base/#tablite.base.Column","title":"tablite.base.Column(path, value=None) ","text":" Bases: object Create Column PARAMETER DESCRIPTION path path of table.yml (defaults: Config.pid_dir) TYPE: Path value Data to store. Defaults to None. TYPE: Iterable DEFAULT: None Source code in tablite/base.py def __init__(self, path, value=None) -> None:\n \"\"\"Create Column\n\n Args:\n path (Path): path of table.yml (defaults: Config.pid_dir)\n value (Iterable, optional): Data to store. 
Defaults to None.\n \"\"\"\n self.path = path\n self.pages = [] # keeps pointers to instances of Page\n if value is not None:\n self.extend(value)\n "},{"location":"reference/base/#tablite.base.Column-attributes","title":"Attributes","text":""},{"location":"reference/base/#tablite.base.Column.path","title":"tablite.base.Column.path = path instance-attribute ","text":""},{"location":"reference/base/#tablite.base.Column.pages","title":"tablite.base.Column.pages = [] instance-attribute ","text":""},{"location":"reference/base/#tablite.base.Column-functions","title":"Functions","text":""},{"location":"reference/base/#tablite.base.Column.__len__","title":"tablite.base.Column.__len__() ","text":"Source code in tablite/base.py def __len__(self):\n return sum(len(p) for p in self.pages)\n "},{"location":"reference/base/#tablite.base.Column.__repr__","title":"tablite.base.Column.__repr__() ","text":"Source code in tablite/base.py def __repr__(self):\n return f\"{self.__class__.__name__}({self.path}, {self[:]})\"\n "},{"location":"reference/base/#tablite.base.Column.repaginate","title":"tablite.base.Column.repaginate() ","text":"resizes pages to Config.PAGE_SIZE Source code in tablite/base.py def repaginate(self):\n \"\"\"resizes pages to Config.PAGE_SIZE\"\"\"\n from tablite.nimlite import repaginate as _repaginate\n\n _repaginate(self)\n "},{"location":"reference/base/#tablite.base.Column.extend","title":"tablite.base.Column.extend(value) ","text":"extends the column. PARAMETER DESCRIPTION value data TYPE: ndarray Source code in tablite/base.py def extend(self, value): # USER FUNCTION.\n \"\"\"extends the column.\n\n Args:\n value (np.ndarray): data\n \"\"\"\n if isinstance(value, Column):\n self.pages.extend(value.pages[:])\n return\n elif isinstance(value, np.ndarray):\n pass\n elif isinstance(value, (list, tuple)):\n value = list_to_np_array(value)\n else:\n raise TypeError(f\"Cannot extend Column with {type(value)}\")\n type_check(value, np.ndarray)\n for array in self._paginate(value):\n self.pages.append(Page(path=self.path, array=array))\n "},{"location":"reference/base/#tablite.base.Column.clear","title":"tablite.base.Column.clear() ","text":"clears the column. Like list().clear() Source code in tablite/base.py def clear(self):\n \"\"\"\n clears the column. Like list().clear()\n \"\"\"\n self.pages.clear()\n "},{"location":"reference/base/#tablite.base.Column.getpages","title":"tablite.base.Column.getpages(item) ","text":"public non-user function to identify any pages + slices of data to be retrieved given a slice (item) PARAMETER DESCRIPTION item target slice of data TYPE: (int, slice) RETURNS DESCRIPTION list of pages/np.ndarrays. Example: [Page(1), Page(2), np.ndarray([4,5,6], int64)] This helps, for example when creating a copy, as the copy can reference the pages 1 and 2 and only need to store the np.ndarray that is unique to it. 
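To make the page-sharing behaviour described above concrete, here is a minimal sketch. The scratch directory and its pages/ subfolder are assumptions for illustration only; in normal use tablite manages the working directory itself (see Config.workdir and Config.pid). ```
# Minimal sketch of page sharing between a Column and its copy.
# Assumption: Column may write pages under `workdir` -- normally
# tablite manages this directory itself (Config.workdir / Config.pid).
from pathlib import Path
from tablite.base import Column

workdir = Path("/tmp/tablite-demo")            # hypothetical scratch directory
(workdir / "pages").mkdir(parents=True, exist_ok=True)

col = Column(workdir, value=[1, 2, 3, 4, 5])
cp = col.copy()                 # the copy references the same immutable pages,
assert cp.pages == col.pages    # so no data is duplicated on disk.
``` Because pages are immutable, sharing them between columns is safe and makes copies cheap.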
Source code in tablite/base.py def getpages(self, item):\n \"\"\"public non-user function to identify any pages + slices\n of data to be retrieved given a slice (item)\n\n Args:\n item (int,slice): target slice of data\n\n Returns:\n list of pages/np.ndarrays.\n\n Example: [Page(1), Page(2), np.ndarray([4,5,6], int64)]\n This helps, for example when creating a copy, as the copy\n can reference the pages 1 and 2 and only need to store\n the np.ndarray that is unique to it.\n \"\"\"\n # internal function\n if isinstance(item, int):\n if item < 0:\n item = len(self) + item\n item = slice(item, item + 1, 1)\n\n type_check(item, slice)\n is_reversed = False if (item.step is None or item.step > 0) else True\n\n length = len(self)\n scan_item = slice(*item.indices(length))\n range_item = range(*item.indices(length))\n\n pages = []\n start, end = 0, 0\n for page in self.pages:\n start, end = end, end + page.len\n if is_reversed:\n if start > scan_item.start:\n break\n if end < scan_item.stop:\n continue\n else:\n if start > scan_item.stop:\n break\n if end < scan_item.start:\n continue\n ro = intercept(range(start, end), range_item)\n if len(ro) == 0:\n continue\n elif len(ro) == page.len: # share the whole immutable page\n pages.append(page)\n else: # fetch the slice and filter it.\n search_slice = slice(ro.start - start, ro.stop - start, ro.step)\n np_arr = load_numpy(page.path)\n match = np_arr[search_slice]\n pages.append(match)\n\n if is_reversed:\n pages.reverse()\n for ix, page in enumerate(pages):\n if isinstance(page, SimplePage):\n data = page.get()\n pages[ix] = np.flip(data)\n else:\n pages[ix] = np.flip(page)\n\n return pages\n "},{"location":"reference/base/#tablite.base.Column.iter_by_page","title":"tablite.base.Column.iter_by_page() ","text":"iterates over the column, page by page. This method minimizes the number of reads. RETURNS DESCRIPTION generator of tuple: start: int end: int data: np.ndarray Source code in tablite/base.py def iter_by_page(self):\n \"\"\"iterates over the column, page by page.\n This method minimizes the number of reads.\n\n Returns:\n generator of tuple:\n start: int\n end: int\n data: np.ndarray\n \"\"\"\n start, end = 0, 0\n for page in self.pages:\n start, end = end, end + page.len\n yield start, end, page\n "},{"location":"reference/base/#tablite.base.Column.__getitem__","title":"tablite.base.Column.__getitem__(item) ","text":"gets numpy array. PARAMETER DESCRIPTION item slice of column TYPE: int OR slice RETURNS DESCRIPTION np.ndarray: results as numpy array. Remember: >>> R = np.array([0,1,2,3,4,5])\n>>> R[3]\n3\n>>> R[3:4]\narray([3])\n Source code in tablite/base.py def __getitem__(self, item): # USER FUNCTION.\n \"\"\"gets numpy array.\n\n Args:\n item (int OR slice): slice of column\n\n Returns:\n np.ndarray: results as numpy array.\n\n Remember:\n ```\n >>> R = np.array([0,1,2,3,4,5])\n >>> R[3]\n 3\n >>> R[3:4]\n array([3])\n ```\n \"\"\"\n result = []\n for element in self.getpages(item):\n if isinstance(element, SimplePage):\n result.append(element.get())\n else:\n result.append(element)\n\n if result:\n arr = np_type_unify(result)\n else:\n arr = np.array([])\n\n if isinstance(item, int):\n if len(arr) == 0:\n raise IndexError(\n f\"index {item} is out of bounds for axis 0 with size {len(self)}\"\n )\n return numpy_to_python(arr[0])\n else:\n return arr\n "},{"location":"reference/base/#tablite.base.Column.__setitem__","title":"tablite.base.Column.__setitem__(key, value) ","text":"sets values. 
PARAMETER DESCRIPTION key selector TYPE: (int, slice) value values to insert TYPE: any RAISES DESCRIPTION KeyError Following normal slicing rules Source code in tablite/base.py def __setitem__(self, key, value): # USER FUNCTION.\n \"\"\"sets values.\n\n Args:\n key (int,slice): selector\n value (any): values to insert\n\n Raises:\n KeyError: Following normal slicing rules\n \"\"\"\n if isinstance(key, int):\n self._setitem_integer_key(key, value)\n\n elif isinstance(key, slice):\n if not isinstance(value, np.ndarray):\n value = list_to_np_array(value)\n type_check(value, np.ndarray)\n\n if key.start is None and key.stop is None and key.step in (None, 1):\n self._setitem_replace_all(key, value)\n elif key.start is not None and key.stop is None and key.step in (None, 1):\n self._setitem_extend(key, value)\n elif key.stop is not None and key.start is None and key.step in (None, 1):\n self._setitem_prextend(key, value)\n elif (\n key.step in (None, 1) and key.start is not None and key.stop is not None\n ):\n self._setitem_insert(key, value)\n elif key.step not in (None, 1):\n self._setitem_update(key, value)\n else:\n raise KeyError(f\"bad key: {key}\")\n else:\n raise KeyError(f\"bad key: {key}\")\n "},{"location":"reference/base/#tablite.base.Column.__delitem__","title":"tablite.base.Column.__delitem__(key) ","text":"deletes items selected by key PARAMETER DESCRIPTION key selector TYPE: (int, slice) RAISES DESCRIPTION KeyError following normal slicing rules. Source code in tablite/base.py def __delitem__(self, key): # USER FUNCTION\n \"\"\"deletes items selected by key\n\n Args:\n key (int,slice): selector\n\n Raises:\n KeyError: following normal slicing rules.\n \"\"\"\n if isinstance(key, int):\n self._del_by_int(key)\n elif isinstance(key, slice):\n self._del_by_slice(key)\n else:\n raise KeyError(f\"bad key: {key}\")\n "},{"location":"reference/base/#tablite.base.Column.get_by_indices","title":"tablite.base.Column.get_by_indices(indices: Union[List[int], np.ndarray]) -> np.ndarray ","text":"retrieves values from column given a set of indices. PARAMETER DESCRIPTION indices targets TYPE: array This method uses np.take, is faster than iterating over rows. 
Examples: >>> indices = np.array(list(range(3,700_700, 426)))\n>>> arr = np.array(list(range(2_000_000)))\nPythonic:\n>>> [v for i,v in enumerate(arr) if i in indices]\nNumpyionic:\n>>> np.take(arr, indices)\n Source code in tablite/base.py def get_by_indices(self, indices: Union[List[int], np.ndarray]) -> np.ndarray:\n \"\"\"retrieves values from column given a set of indices.\n\n Args:\n indices (np.array): targets\n\n This method uses np.take, is faster than iterating over rows.\n Examples:\n ```\n >>> indices = np.array(list(range(3,700_700, 426)))\n >>> arr = np.array(list(range(2_000_000)))\n Pythonic:\n >>> [v for i,v in enumerate(arr) if i in indices]\n Numpyionic:\n >>> np.take(arr, indices)\n ```\n \"\"\"\n type_check(indices, np.ndarray)\n\n dtypes = set()\n values = np.empty(\n indices.shape, dtype=object\n ) # placeholder for the indexed values.\n\n for start, end, page in self.iter_by_page():\n range_match = np.asarray(((indices >= start) & (indices < end)) | (indices == -1)).nonzero()[0]\n if len(range_match):\n # only fetch the data if there's a range match!\n data = page.get() \n sub_index = np.take(indices, range_match)\n # sub_index2 otherwise will raise index error where len(data) > (-1 - start)\n # so the clause below is required:\n if len(data) > (-1 - start):\n sub_index = np.where(sub_index == -1, -1, sub_index - start)\n arr = np.take(data, sub_index)\n dtypes.add(arr.dtype)\n np.put(values, range_match, arr)\n\n if len(dtypes) == 1: # simplify the datatype\n dtype = next(iter(dtypes))\n values = np.array(values, dtype=dtype)\n return values\n "},{"location":"reference/base/#tablite.base.Column.__iter__","title":"tablite.base.Column.__iter__() ","text":"Source code in tablite/base.py def __iter__(self): # USER FUNCTION.\n for page in self.pages:\n data = page.get()\n for value in data:\n yield value\n "},{"location":"reference/base/#tablite.base.Column.__eq__","title":"tablite.base.Column.__eq__(other) ","text":"compares two columns. Like list1 == list2 Source code in tablite/base.py def __eq__(self, other): # USER FUNCTION.\n \"\"\"\n compares two columns. Like `list1 == list2`\n \"\"\"\n if len(self) != len(other): # quick cheap check.\n return False\n\n if isinstance(other, (list, tuple)):\n return all(a == b for a, b in zip(self[:], other))\n\n elif isinstance(other, Column):\n if self.pages == other.pages: # special case.\n return True\n\n # are the pages of same size?\n if len(self.pages) == len(other.pages):\n if [p.len for p in self.pages] == [p.len for p in other.pages]:\n for a, b in zip(self.pages, other.pages):\n if not (a.get() == b.get()).all():\n return False\n return True\n # to bad. Element comparison it is then:\n for a, b in zip(iter(self), iter(other)):\n if a != b:\n return False\n return True\n\n elif isinstance(other, np.ndarray):\n start, end = 0, 0\n for p in self.pages:\n start, end = end, end + p.len\n if not (p.get() == other[start:end]).all():\n return False\n return True\n else:\n raise TypeError(f\"Cannot compare {self.__class__} with {type(other)}\")\n "},{"location":"reference/base/#tablite.base.Column.__ne__","title":"tablite.base.Column.__ne__(other) ","text":"compares two columns. Like list1 != list2 Source code in tablite/base.py def __ne__(self, other): # USER FUNCTION\n \"\"\"\n compares two columns. 
Like `list1 != list2`\n \"\"\"\n if len(self) != len(other): # quick cheap check.\n return True\n\n if isinstance(other, (list, tuple)):\n return any(a != b for a, b in zip(self[:], other))\n\n elif isinstance(other, Column):\n if self.pages == other.pages: # special case.\n return False\n\n # are the pages of same size?\n if len(self.pages) == len(other.pages):\n if [p.len for p in self.pages] == [p.len for p in other.pages]:\n for a, b in zip(self.pages, other.pages):\n if not (a.get() == b.get()).all():\n return True\n return False\n # to bad. Element comparison it is then:\n for a, b in zip(iter(self), iter(other)):\n if a != b:\n return True\n return False\n\n elif isinstance(other, np.ndarray):\n start, end = 0, 0\n for p in self.pages:\n start, end = end, end + p.len\n if (p.get() != other[start:end]).any():\n return True\n return False\n else:\n raise TypeError(f\"Cannot compare {self.__class__} with {type(other)}\")\n "},{"location":"reference/base/#tablite.base.Column.copy","title":"tablite.base.Column.copy() ","text":"returns deep=copy of Column RETURNS DESCRIPTION Column Source code in tablite/base.py def copy(self):\n \"\"\"returns deep=copy of Column\n\n Returns:\n Column\n \"\"\"\n cp = Column(path=self.path)\n cp.pages = self.pages[:]\n return cp\n "},{"location":"reference/base/#tablite.base.Column.__copy__","title":"tablite.base.Column.__copy__() ","text":"see copy Source code in tablite/base.py def __copy__(self):\n \"\"\"see copy\"\"\"\n return self.copy()\n "},{"location":"reference/base/#tablite.base.Column.__imul__","title":"tablite.base.Column.__imul__(other) ","text":"Repeats instance of column N times. Like list() * N Example: >>> one = Column(data=[1,2])\n>>> one *= 5\n>>> one\n[1,2, 1,2, 1,2, 1,2, 1,2]\n Source code in tablite/base.py def __imul__(self, other):\n \"\"\"\n Repeats instance of column N times. Like list() * N\n\n Example:\n ```\n >>> one = Column(data=[1,2])\n >>> one *= 5\n >>> one\n [1,2, 1,2, 1,2, 1,2, 1,2]\n ```\n \"\"\"\n if not (isinstance(other, int) and other > 0):\n raise TypeError(\n f\"a column can be repeated an integer number of times, not {type(other)} number of times\"\n )\n self.pages = self.pages[:] * other\n return self\n "},{"location":"reference/base/#tablite.base.Column.__mul__","title":"tablite.base.Column.__mul__(other) ","text":"Repeats instance of column N times. Like list() * N Example: >>> one = Column(data=[1,2])\n>>> two = one * 5\n>>> two\n[1,2, 1,2, 1,2, 1,2, 1,2]\n Source code in tablite/base.py def __mul__(self, other):\n \"\"\"\n Repeats instance of column N times. Like list() * N\n\n Example:\n ```\n >>> one = Column(data=[1,2])\n >>> two = one * 5\n >>> two\n [1,2, 1,2, 1,2, 1,2, 1,2]\n ```\n \"\"\"\n if not isinstance(other, int):\n raise TypeError(\n f\"a column can be repeated an integer number of times, not {type(other)} number of times\"\n )\n cp = self.copy()\n cp *= other\n return cp\n "},{"location":"reference/base/#tablite.base.Column.__iadd__","title":"tablite.base.Column.__iadd__(other) ","text":"Source code in tablite/base.py def __iadd__(self, other):\n if isinstance(other, (list, tuple)):\n other = list_to_np_array(other)\n self.extend(other)\n elif isinstance(other, Column):\n self.pages.extend(other.pages[:])\n else:\n raise TypeError(f\"{type(other)} not supported.\")\n return self\n "},{"location":"reference/base/#tablite.base.Column.__contains__","title":"tablite.base.Column.__contains__(item) ","text":"determines if item is in the Column. 
Similar to 'x' in ['a','b','c'] returns boolean PARAMETER DESCRIPTION item value to search for TYPE: any RETURNS DESCRIPTION bool True if item exists in column. Source code in tablite/base.py def __contains__(self, item):\n \"\"\"determines if item is in the Column.\n Similar to `'x' in ['a','b','c']`\n returns boolean\n\n Args:\n item (any): value to search for\n\n Returns:\n bool: True if item exists in column.\n \"\"\"\n for page in set(self.pages):\n if item in page.get(): # x in np.ndarray([...]) uses np.any(arr, value)\n return True\n return False\n "},{"location":"reference/base/#tablite.base.Column.remove_all","title":"tablite.base.Column.remove_all(*values) ","text":"removes all values of values Source code in tablite/base.py def remove_all(self, *values):\n \"\"\"\n removes all values of `values`\n \"\"\"\n type_check(values, tuple)\n if isinstance(values[0], tuple):\n values = values[0]\n to_remove = list_to_np_array(values)\n for index, page in enumerate(self.pages):\n data = page.get()\n bitmask = np.isin(data, to_remove) # identify elements to remove.\n if bitmask.any():\n bitmask = np.invert(bitmask) # turn bitmask around to keep.\n new_data = np.compress(bitmask, data)\n new_page = Page(self.path, new_data)\n self.pages[index] = new_page\n "},{"location":"reference/base/#tablite.base.Column.replace","title":"tablite.base.Column.replace(mapping) ","text":"replaces values using a mapping. PARAMETER DESCRIPTION mapping {value to replace: new value, ...} TYPE: dict Example: >>> t = Table(columns={'A': [1,2,3,4]})\n>>> t['A'].replace({2:20,4:40})\n>>> t[:]\nnp.ndarray([1,20,3,40])\n Source code in tablite/base.py def replace(self, mapping):\n \"\"\"\n replaces values using a mapping.\n\n Args:\n mapping (dict): {value to replace: new value, ...}\n\n Example:\n ```\n >>> t = Table(columns={'A': [1,2,3,4]})\n >>> t['A'].replace({2:20,4:40})\n >>> t[:]\n np.ndarray([1,20,3,40])\n ```\n \"\"\"\n type_check(mapping, dict)\n to_replace = np.array(list(mapping.keys()))\n for index, page in enumerate(self.pages):\n data = page.get()\n bitmask = np.isin(data, to_replace) # identify elements to replace.\n if bitmask.any():\n warray = np.compress(bitmask, data)\n py_dtype = page.dtype\n for ix, v in enumerate(warray):\n old_py_val = numpy_to_python(v)\n new_py_val = mapping[old_py_val]\n old_dt = type(old_py_val)\n new_dt = type(new_py_val)\n\n warray[ix] = new_py_val\n\n py_dtype[new_dt] = py_dtype.get(new_dt, 0) + 1\n py_dtype[old_dt] = py_dtype.get(old_dt, 0) - 1\n\n if py_dtype[old_dt] <= 0:\n del py_dtype[old_dt]\n\n data[bitmask] = warray\n self.pages[index] = Page(path=self.path, array=data)\n "},{"location":"reference/base/#tablite.base.Column.types","title":"tablite.base.Column.types() ","text":"returns dict with python datatypes RETURNS DESCRIPTION dict frequency of occurrence of python datatypes Source code in tablite/base.py def types(self):\n \"\"\"\n returns dict with python datatypes\n\n Returns:\n dict: frequency of occurrence of python datatypes\n \"\"\"\n d = Counter()\n for page in self.pages:\n assert isinstance(page.dtype, dict)\n d += page.dtype\n return dict(d)\n "},{"location":"reference/base/#tablite.base.Column.index","title":"tablite.base.Column.index() ","text":"returns dict with { unique entry : list of indices } example: >>> c = Column(data=['a','b','a','c','b'])\n>>> c.index()\n{'a':[0,2], 'b': [1,4], 'c': [3]}\n Source code in tablite/base.py def index(self):\n \"\"\"\n returns dict with { unique entry : list of indices }\n\n example:\n ```\n >>> c = 
Column(data=['a','b','a','c','b'])\n >>> c.index()\n {'a':[0,2], 'b': [1,4], 'c': [3]}\n ```\n \"\"\"\n d = defaultdict(list)\n for ix, v in enumerate(self.__iter__()):\n d[v].append(ix)\n return dict(d)\n "},{"location":"reference/base/#tablite.base.Column.unique","title":"tablite.base.Column.unique() ","text":"returns unique list of values. example: >>> c = Column(data=['a','b','a','c','b'])\n>>> c.unique()\n['a','b','c']\n Source code in tablite/base.py def unique(self):\n \"\"\"\n returns unique list of values.\n\n example:\n ```\n >>> c = Column(data=['a','b','a','c','b'])\n >>> c.unique()\n ['a','b','c']\n ```\n \"\"\"\n arrays = []\n for page in set(self.pages):\n try: # when it works, numpy is fast...\n arrays.append(np.unique(page.get()))\n except TypeError: # ...but np.unique cannot handle Nones.\n arrays.append(multitype_set(page.get()))\n union = np_type_unify(arrays)\n try:\n return np.unique(union)\n except MemoryError:\n return np.array(set(union))\n except TypeError:\n return multitype_set(union)\n "},{"location":"reference/base/#tablite.base.Column.histogram","title":"tablite.base.Column.histogram() ","text":"returns 2 arrays: unique elements and count of each element example: >>> c = Column(data=['a','b','a','c','b'])\n>>> c.histogram()\n{'a':2,'b':2,'c':1}\n Source code in tablite/base.py def histogram(self):\n \"\"\"\n returns 2 arrays: unique elements and count of each element\n\n example:\n ```\n >>> c = Column(data=['a','b','a','c','b'])\n >>> c.histogram()\n {'a':2,'b':2,'c':1}\n ```\n \"\"\"\n d = defaultdict(int)\n for page in self.pages:\n try:\n uarray, carray = np.unique(page.get(), return_counts=True)\n except TypeError:\n uarray = page.get()\n carray = repeat(1, len(uarray))\n\n for i, c in zip(uarray, carray):\n v = numpy_to_python(i)\n d[(type(v), v)] += numpy_to_python(c)\n u = [v for _, v in d.keys()]\n c = list(d.values())\n return u, c # unique, counts\n "},{"location":"reference/base/#tablite.base.Column.statistics","title":"tablite.base.Column.statistics() ","text":"provides summary statistics. RETURNS DESCRIPTION dict returns dict with: - min (int/float, length of str, date)
- max (int/float, length of str, date)
- mean (int/float, length of str, date)
- median (int/float, length of str, date)
- stdev (int/float, length of str, date)
- mode (int/float, length of str, date)
- distinct (int/float, length of str, date)
- iqr (int/float, length of str, date)
- sum (int/float, length of str, date)
- histogram (see .histogram)
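A hedged usage sketch of the above; the exact dictionary keys are assumed to match the list just given: ```
# Hedged sketch: summary statistics for a numeric column.
from tablite import Table

t = Table(columns={"n": [1, 2, 2, 3, 4]})
stats = t["n"].statistics()                       # dict with the keys listed above
print(stats["min"], stats["max"], stats["mean"])  # e.g. 1 4 2.4 (values illustrative)
```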
Source code in tablite/base.py def statistics(self):\n \"\"\"provides summary statistics.\n\n Returns:\n dict: returns dict with:\n - min (int/float, length of str, date)\n - max (int/float, length of str, date)\n - mean (int/float, length of str, date)\n - median (int/float, length of str, date)\n - stdev (int/float, length of str, date)\n - mode (int/float, length of str, date)\n - distinct (int/float, length of str, date)\n - iqr (int/float, length of str, date)\n - sum (int/float, length of str, date)\n - histogram (see .histogram)\n \"\"\"\n values, counts = self.histogram()\n return summary_statistics(values, counts)\n "},{"location":"reference/base/#tablite.base.Column.count","title":"tablite.base.Column.count(item) ","text":"counts appearances of item in column. Note that in python, True == 1 and False == 0 , whereby the following difference occurs: in python: >>> L = [1, True]\n>>> L.count(True)\n2\n in tablite: >>> t = Table({'L': [1,True]})\n>>> t['L'].count(True)\n1\n PARAMETER DESCRIPTION item target item TYPE: Any RETURNS DESCRIPTION int number of occurrences of item. Source code in tablite/base.py def count(self, item):\n \"\"\"counts appearances of item in column.\n\n Note that in python, `True == 1` and `False == 0`,\n whereby the following difference occurs:\n\n in python:\n ```\n >>> L = [1, True]\n >>> L.count(True)\n 2\n ```\n in tablite:\n ```\n >>> t = Table({'L': [1,True]})\n >>> t['L'].count(True)\n 1\n ```\n\n Args:\n item (Any): target item\n\n Returns:\n int: number of occurrences of item.\n \"\"\"\n result = 0\n for page in self.pages:\n data = page.get()\n if data.dtype != \"O\":\n result += np.nonzero(page.get() == item)[0].shape[0]\n # what happens here ---^ below:\n # arr = page.get()\n # >>> arr\n # array([1,2,3,4,3], int64)\n # >>> (arr == 3)\n # array([False, False, True, False, True])\n # >>> np.nonzero(arr==3)\n # (array([2,4], dtype=int64), ) <-- tuple!\n # >>> np.nonzero(page.get() == item)[0]\n # array([2,4])\n # >>> np.nonzero(page.get() == item)[0].shape\n # (2, )\n # >>> np.nonzero(page.get() == item)[0].shape[0]\n # 2\n else:\n result += sum(1 for i in data if type(i) == type(item) and i == item)\n return result\n "},{"location":"reference/base/#tablite.base.BaseTable","title":"tablite.base.BaseTable(columns: [dict, None] = None, headers: [list, None] = None, rows: [list, None] = None, _path: [Path, None] = None) ","text":" Bases: object creates Table PARAMETER DESCRIPTION EITHER columns (dict, optional): dict with column names as keys, values as lists. Example: t = Table(columns={\"a\": [1, 2], \"b\": [3, 4]}) _path path to main process working directory. 
TYPE: Path DEFAULT: None Source code in tablite/base.py def __init__(\n self,\n columns: [dict, None] = None,\n headers: [list, None] = None,\n rows: [list, None] = None,\n _path: [Path, None] = None,\n) -> None:\n \"\"\"creates Table\n\n Args:\n EITHER:\n columns (dict, optional): dict with column names as keys, values as lists.\n Example: t = Table(columns={\"a\": [1, 2], \"b\": [3, 4]})\n OR\n headers (list of strings, optional): list of column names.\n rows (list of tuples or lists, optional): values for columns\n Example: t = Table(headers=[\"a\", \"b\"], rows=[[1,3], [2,4]])\n\n _path (pathlib.Path, optional): path to main process working directory.\n \"\"\"\n if _path is None:\n if self._pid_dir is None:\n self._pid_dir = Path(Config.workdir) / Config.pid\n if not self._pid_dir.exists():\n self._pid_dir.mkdir()\n (self._pid_dir / \"pages\").mkdir()\n register(self._pid_dir)\n\n _path = Path(self._pid_dir)\n # if path exists under the given PID it will be overwritten.\n # this can only happen if the process previously was SIGKILLed.\n type_check(_path, Path)\n self.path = _path # filename used during multiprocessing.\n self.columns = {} # maps colunn names to instances of Column.\n\n # user friendly features.\n if columns and any((headers, rows)):\n raise ValueError(\"Either columns as dict OR headers and rows. Not both.\")\n\n if headers and rows:\n rotated = list(zip(*rows))\n columns = {k: v for k, v in zip(headers, rotated)}\n\n if columns:\n type_check(columns, dict)\n for k, v in columns.items():\n self.__setitem__(k, v)\n "},{"location":"reference/base/#tablite.base.BaseTable-attributes","title":"Attributes","text":""},{"location":"reference/base/#tablite.base.BaseTable.path","title":"tablite.base.BaseTable.path = _path instance-attribute ","text":""},{"location":"reference/base/#tablite.base.BaseTable.columns","title":"tablite.base.BaseTable.columns = {} instance-attribute ","text":""},{"location":"reference/base/#tablite.base.BaseTable.rows","title":"tablite.base.BaseTable.rows property ","text":"enables row based iteration in python types. Example: for row in Table.rows:\n print(row)\n Yields: tuple: values is same order as columns. 
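A short sketch of row-based iteration; each row is yielded as a tuple with values in the same order as the columns: ```
# Rows are yielded one tuple at a time, in column order.
from tablite import Table

t = Table(columns={"a": [1, 2], "b": [3, 4]})
for row in t.rows:
    print(row)   # (1, 3) then (2, 4)
```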
"},{"location":"reference/base/#tablite.base.BaseTable-functions","title":"Functions","text":""},{"location":"reference/base/#tablite.base.BaseTable.__str__","title":"tablite.base.BaseTable.__str__() ","text":"Source code in tablite/base.py def __str__(self): # USER FUNCTION.\n return f\"{self.__class__.__name__}({len(self.columns):,} columns, {len(self):,} rows)\"\n "},{"location":"reference/base/#tablite.base.BaseTable.__repr__","title":"tablite.base.BaseTable.__repr__() ","text":"Source code in tablite/base.py def __repr__(self):\n return self.__str__()\n "},{"location":"reference/base/#tablite.base.BaseTable.nbytes","title":"tablite.base.BaseTable.nbytes() ","text":"finds the total bytes of the table on disk RETURNS DESCRIPTION tuple int: real bytes used on disk int: total bytes used if flattened Source code in tablite/base.py def nbytes(self): # USER FUNCTION.\n \"\"\"finds the total bytes of the table on disk\n\n Returns:\n tuple:\n int: real bytes used on disk\n int: total bytes used if flattened\n \"\"\"\n real = {}\n total = 0\n for column in self.columns.values():\n for page in set(column.pages):\n real[page] = page.path.stat().st_size\n for page in column.pages:\n total += real[page]\n return sum(real.values()), total\n "},{"location":"reference/base/#tablite.base.BaseTable.items","title":"tablite.base.BaseTable.items() ","text":"returns table as dict RETURNS DESCRIPTION dict Table as dict {column_name: [values], ...} Source code in tablite/base.py def items(self): # USER FUNCTION.\n \"\"\"returns table as dict\n\n Returns:\n dict: Table as dict `{column_name: [values], ...}`\n \"\"\"\n return {\n name: column[:].tolist() for name, column in self.columns.items()\n }.items()\n "},{"location":"reference/base/#tablite.base.BaseTable.__delitem__","title":"tablite.base.BaseTable.__delitem__(key) ","text":"Examples: >>> del table['a'] # removes column 'a'\n>>> del table[-3:] # removes last 3 rows from all columns.\n Source code in tablite/base.py def __delitem__(self, key): # USER FUNCTION.\n \"\"\"\n Examples:\n ```\n >>> del table['a'] # removes column 'a'\n >>> del table[-3:] # removes last 3 rows from all columns.\n ```\n \"\"\"\n if isinstance(key, (int, slice)):\n for column in self.columns.values():\n del column[key]\n elif key in self.columns:\n del self.columns[key]\n else:\n raise KeyError(f\"Key not found: {key}\")\n "},{"location":"reference/base/#tablite.base.BaseTable.__setitem__","title":"tablite.base.BaseTable.__setitem__(key, value) ","text":"table behaves like a dict. Args: key (str or hashable): column name value (iterable): list, tuple or nd.array with values. As Table now accepts the keyword columns as a dict: >>> t = Table(columns={'b':[4,5,6], 'c':[7,8,9]})\n and the header/data combinations: >>> t = Table(header=['b','c'], data=[[4,5,6],[7,8,9]])\n This has the side-benefit that tuples now can be used as headers. 
Source code in tablite/base.py def __setitem__(self, key, value): # USER FUNCTION\n \"\"\"table behaves like a dict.\n Args:\n key (str or hashable): column name\n value (iterable): list, tuple or nd.array with values.\n\n As Table now accepts the keyword `columns` as a dict:\n ```\n >>> t = Table(columns={'b':[4,5,6], 'c':[7,8,9]})\n ```\n and the header/data combinations:\n ```\n >>> t = Table(header=['b','c'], data=[[4,5,6],[7,8,9]])\n ```\n This has the side-benefit that tuples now can be used as headers.\n \"\"\"\n if value is None:\n self.columns[key] = Column(self.path, value=None)\n elif isinstance(value, (list, tuple)):\n value = list_to_np_array(value)\n self.columns[key] = Column(self.path, value)\n elif isinstance(value, (np.ndarray)):\n self.columns[key] = Column(self.path, value)\n elif isinstance(value, Column):\n self.columns[key] = value\n else:\n raise TypeError(f\"{type(value)} not supported.\")\n "},{"location":"reference/base/#tablite.base.BaseTable.__getitem__","title":"tablite.base.BaseTable.__getitem__(keys) ","text":"Enables selection of columns and rows PARAMETER DESCRIPTION keys TYPE: column name, integer or slice Examples >>> 10] selects first 10 rows from all columns TYPE: table[ >>> 20:3] selects column 'b' and 'c' and 'a' twice for a slice. TYPE: table['b', 'a', 'a', 'c', 2 Raises: KeyError: if key is not found. TypeError: if key is not a string, integer or slice. RETURNS DESCRIPTION Table returns columns in same order as selection. Source code in tablite/base.py def __getitem__(self, keys): # USER FUNCTION\n \"\"\"\n Enables selection of columns and rows\n\n Args:\n keys (column name, integer or slice):\n Examples:\n ```\n >>> table['a'] selects column 'a'\n >>> table[3] selects row 3 as a tuple.\n >>> table[:10] selects first 10 rows from all columns\n >>> table['a','b', slice(3,20,2)] selects a slice from columns 'a' and 'b'\n >>> table['b', 'a', 'a', 'c', 2:20:3] selects column 'b' and 'c' and 'a' twice for a slice.\n >>> table[('b', 'a', 'a', 'c')] selects columns 'b', 'a', 'a', and 'c' using a tuple.\n ```\n Raises:\n KeyError: if key is not found.\n TypeError: if key is not a string, integer or slice.\n\n Returns:\n Table: returns columns in same order as selection.\n \"\"\"\n\n if not isinstance(keys, tuple):\n if isinstance(keys, list):\n keys = tuple(keys)\n else:\n keys = (keys,)\n if isinstance(keys[0], tuple):\n keys = tuple(list(chain(*keys)))\n\n integers = [i for i in keys if isinstance(i, int)]\n if len(integers) == len(keys) == 1: # return a single tuple.\n keys = [slice(keys[0])]\n\n column_names = [i for i in keys if isinstance(i, str)]\n column_names = list(self.columns) if not column_names else column_names\n not_found = [name for name in column_names if name not in self.columns]\n if not_found:\n raise KeyError(f\"keys not found: {', '.join(not_found)}\")\n\n slices = [i for i in keys if isinstance(i, slice)]\n slc = slice(0, len(self)) if not slices else slices[0]\n\n if (\n len(slices) == 0 and len(column_names) == 1\n ): # e.g. tbl['a'] or tbl['a'][:10]\n col = self.columns[column_names[0]]\n if slices:\n return col[slc] # return slice from column as list of values\n else:\n return col # return whole column\n\n elif len(integers) == 1: # return a single tuple.\n row_no = integers[0]\n slc = slice(row_no, row_no + 1)\n return tuple(self.columns[name][slc].tolist()[0] for name in column_names)\n\n elif not slices: # e.g. 
new table with N whole columns.\n return self.__class__(\n columns={name: self.columns[name] for name in column_names}\n )\n\n else: # e.g. new table from selection of columns and slices.\n t = self.__class__()\n for name in column_names:\n column = self.columns[name]\n\n new_column = Column(t.path) # create new Column.\n for item in column.getpages(slc):\n if isinstance(item, np.ndarray):\n new_column.extend(item) # extend subslice (expensive)\n elif isinstance(item, SimplePage):\n new_column.pages.append(item) # extend page (cheap)\n else:\n raise TypeError(f\"Bad item: {item}\")\n\n # below:\n # set the new column directly on t.columns.\n # Do not use t[name] as that triggers __setitem__ again.\n t.columns[name] = new_column\n\n return t\n "},{"location":"reference/base/#tablite.base.BaseTable.__len__","title":"tablite.base.BaseTable.__len__() ","text":"Source code in tablite/base.py def __len__(self): # USER FUNCTION.\n if not self.columns:\n return 0\n return max(len(c) for c in self.columns.values())\n "},{"location":"reference/base/#tablite.base.BaseTable.__eq__","title":"tablite.base.BaseTable.__eq__(other) -> bool ","text":"Determines if two tables have identical content. PARAMETER DESCRIPTION other table for comparison TYPE: Table RETURNS DESCRIPTION bool True if tables are identical. TYPE: bool Source code in tablite/base.py def __eq__(self, other) -> bool: # USER FUNCTION.\n \"\"\"Determines if two tables have identical content.\n\n Args:\n other (Table): table for comparison\n\n Returns:\n bool: True if tables are identical.\n \"\"\"\n if isinstance(other, dict):\n return self.items() == other.items()\n if not isinstance(other, BaseTable):\n return False\n if id(self) == id(other):\n return True\n if len(self) != len(other):\n return False\n if len(self) == len(other) == 0:\n return True\n if self.columns.keys() != other.columns.keys():\n return False\n for name, col in self.columns.items():\n if not (col == other.columns[name]):\n return False\n return True\n "},{"location":"reference/base/#tablite.base.BaseTable.clear","title":"tablite.base.BaseTable.clear() ","text":"clears the table. Like dict().clear() Source code in tablite/base.py def clear(self): # USER FUNCTION.\n \"\"\"clears the table. Like dict().clear()\"\"\"\n self.columns.clear()\n "},{"location":"reference/base/#tablite.base.BaseTable.save","title":"tablite.base.BaseTable.save(path, compression_method=zipfile.ZIP_DEFLATED, compression_level=1) ","text":"saves table to compressed tpz file. PARAMETER DESCRIPTION path file destination. TYPE: Path compression_method See zipfile compression methods. Defaults to ZIP_DEFLATED. DEFAULT: ZIP_DEFLATED compression_level See zipfile compression levels. Defaults to 1. DEFAULT: 1 The file format is as follows: .tpz is a gzip archive with table metadata captured as table.yml and the necessary set of pages saved as .npy files. 
The zip contains table.yml which provides an overview of the data: --------------------------------------\n%YAML 1.2 yaml version\ncolumns: start of columns section.\n name: \u201c\u5217 1\u201d name of column 1.\n pages: [p1b1, p1b2] list of pages in column 1.\n name: \u201c\u5217 2\u201d name of column 2\n pages: [p2b1, p2b2] list of pages in column 2.\n----------------------------------------\n Source code in tablite/base.py def save(\n self, path, compression_method=zipfile.ZIP_DEFLATED, compression_level=1\n): # USER FUNCTION.\n \"\"\"saves table to compressed tpz file.\n\n Args:\n path (Path): file destination.\n compression_method: See zipfile compression methods. Defaults to ZIP_DEFLATED.\n compression_level: See zipfile compression levels. Defaults to 1.\n The default settings produce 80% compression at 10% slowdown.\n\n The file format is as follows:\n .tpz is a gzip archive with table metadata captured as table.yml\n and the necessary set of pages saved as .npy files.\n\n The zip contains table.yml which provides an overview of the data:\n ```\n --------------------------------------\n %YAML 1.2 yaml version\n columns: start of columns section.\n name: \u201c\u5217 1\u201d name of column 1.\n pages: [p1b1, p1b2] list of pages in column 1.\n name: \u201c\u5217 2\u201d name of column 2\n pages: [p2b1, p2b2] list of pages in column 2.\n ----------------------------------------\n ```\n \"\"\"\n if isinstance(path, str):\n path = Path(path)\n type_check(path, Path)\n if path.is_dir():\n raise TypeError(f\"filename needed: {path}\")\n if path.suffix != \".tpz\":\n path = path.parent / (path.parts[-1] + \".tpz\")\n\n # create yaml document\n _page_counter = 0\n d = {}\n cols = {}\n for name, col in self.columns.items():\n type_check(col, Column)\n cols[name] = {\"pages\": [p.path.name for p in col.pages]}\n _page_counter += len(col.pages)\n d[\"columns\"] = cols\n yml = yaml.safe_dump(\n d, sort_keys=False, allow_unicode=True, default_flow_style=None\n )\n\n _file_counter = 0\n with zipfile.ZipFile(\n path, \"w\", compression=compression_method, compresslevel=compression_level\n ) as f:\n log.debug(f\"writing .tpz to {path} with\\n{yml}\")\n f.writestr(\"table.yml\", yml)\n for name, col in self.columns.items():\n for page in set(\n col.pages\n ): # set of pages! remember t *= 1000 repeats t 1000x\n with open(page.path, \"rb\", buffering=0) as raw_io:\n f.writestr(page.path.name, raw_io.read())\n _file_counter += 1\n log.debug(f\"adding Page {page.path}\")\n\n _fields = len(self) * len(self.columns)\n _avg = _fields // _page_counter\n log.debug(\n f\"Wrote {_fields:,} on {_page_counter:,} pages in {_file_counter} files: {_avg} fields/page\"\n )\n "},{"location":"reference/base/#tablite.base.BaseTable.load","title":"tablite.base.BaseTable.load(path, tqdm=_tqdm) classmethod ","text":"loads a table from .tpz file. See also Table.save for details on the file format. PARAMETER DESCRIPTION path source file TYPE: Path RETURNS DESCRIPTION Table table in read-only mode. 
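A round-trip sketch of save and load; the file name is illustrative: ```
# Hedged sketch: persist a table as .tpz and read it back.
from tablite import Table

t = Table(columns={"a": [1, 2], "b": [3, 4]})
t.save("demo.tpz")             # zip archive holding table.yml + .npy pages
t2 = Table.load("demo.tpz")    # loaded table is read-only per the docs
assert t == t2
```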
Source code in tablite/base.py @classmethod\ndef load(cls, path, tqdm=_tqdm): # USER FUNCTION.\n \"\"\"loads a table from .tpz file.\n See also Table.save for details on the file format.\n\n Args:\n path (Path): source file\n\n Returns:\n Table: table in read-only mode.\n \"\"\"\n path = Path(path)\n log.debug(f\"loading {path}\")\n with zipfile.ZipFile(path, \"r\") as f:\n yml = f.read(\"table.yml\")\n metadata = yaml.safe_load(yml)\n t = cls()\n\n page_count = sum([len(c[\"pages\"]) for c in metadata[\"columns\"].values()])\n\n with tqdm(\n total=page_count,\n desc=f\"loading '{path.name}' file\",\n disable=Config.TQDM_DISABLE,\n ) as pbar:\n for name, d in metadata[\"columns\"].items():\n column = Column(t.path)\n for page in d[\"pages\"]:\n bytestream = io.BytesIO(f.read(page))\n data = np.load(bytestream, allow_pickle=True, fix_imports=False)\n column.extend(data)\n pbar.update(1)\n t.columns[name] = column\n update_access_time(path)\n return t\n "},{"location":"reference/base/#tablite.base.BaseTable.copy","title":"tablite.base.BaseTable.copy() ","text":"Source code in tablite/base.py def copy(self):\n cls = type(self)\n t = cls()\n for name, column in self.columns.items():\n new = Column(t.path)\n new.pages = column.pages[:]\n t.columns[name] = new\n return t\n "},{"location":"reference/base/#tablite.base.BaseTable.__imul__","title":"tablite.base.BaseTable.__imul__(other) ","text":"Repeats instance of table N times. Like list: t = t * N PARAMETER DESCRIPTION other multiplier TYPE: int Source code in tablite/base.py def __imul__(self, other):\n \"\"\"Repeats instance of table N times.\n\n Like list: `t = t * N`\n\n Args:\n other (int): multiplier\n \"\"\"\n if not (isinstance(other, int) and other > 0):\n raise TypeError(\n f\"a table can be repeated an integer number of times, not {type(other)} number of times\"\n )\n for col in self.columns.values():\n col *= other\n return self\n "},{"location":"reference/base/#tablite.base.BaseTable.__mul__","title":"tablite.base.BaseTable.__mul__(other) ","text":"Repeat table N times. Like list: new = old * N PARAMETER DESCRIPTION other multiplier TYPE: int RETURNS DESCRIPTION Table Source code in tablite/base.py def __mul__(self, other):\n \"\"\"Repeat table N times.\n Like list: `new = old * N`\n\n Args:\n other (int): multiplier\n\n Returns:\n Table\n \"\"\"\n new = self.copy()\n return new.__imul__(other)\n "},{"location":"reference/base/#tablite.base.BaseTable.__iadd__","title":"tablite.base.BaseTable.__iadd__(other) ","text":"Concatenates tables with same column names. Like list: table_1 += table_2 RAISES DESCRIPTION ValueError If column names don't match. RETURNS DESCRIPTION None self is updated. Source code in tablite/base.py def __iadd__(self, other):\n \"\"\"Concatenates tables with same column names.\n\n Like list: `table_1 += table_2`\n\n Args:\n other (Table)\n\n Raises:\n ValueError: If column names don't match.\n\n Returns:\n None: self is updated.\n \"\"\"\n type_check(other, BaseTable)\n for name in self.columns.keys():\n if name not in other.columns:\n raise ValueError(f\"{name} not in other\")\n for name in other.columns.keys():\n if name not in self.columns:\n raise ValueError(f\"{name} missing from self\")\n\n for name, column in self.columns.items():\n other_col = other.columns.get(name, None)\n column.pages.extend(other_col.pages[:])\n return self\n "},{"location":"reference/base/#tablite.base.BaseTable.__add__","title":"tablite.base.BaseTable.__add__(other) ","text":"Concatenates tables with same column names. 
Like list: table_3 = table_1 + table_2 RAISES DESCRIPTION ValueError If column names don't match. RETURNS DESCRIPTION Table Source code in tablite/base.py def __add__(self, other):\n \"\"\"Concatenates tables with same column names.\n\n Like list: `table_3 = table_1 + table_2`\n\n Args:\n other (Table)\n\n Raises:\n ValueError: If column names don't match.\n\n Returns:\n Table\n \"\"\"\n type_check(other, BaseTable)\n cp = self.copy()\n cp += other\n return cp\n "},{"location":"reference/base/#tablite.base.BaseTable.add_rows","title":"tablite.base.BaseTable.add_rows(*args, **kwargs) ","text":"its more efficient to add many rows at once. if both args and kwargs, then args are added first, followed by kwargs. supported cases: >>> t = Table()\n>>> t.add_columns('row','A','B','C')\n>>> t.add_rows(1, 1, 2, 3) # (1) individual values as args\n>>> t.add_rows([2, 1, 2, 3]) # (2) list of values as args\n>>> t.add_rows((3, 1, 2, 3)) # (3) tuple of values as args\n>>> t.add_rows(*(4, 1, 2, 3)) # (4) unpacked tuple becomes arg like (1)\n>>> t.add_rows(row=5, A=1, B=2, C=3) # (5) kwargs\n>>> t.add_rows(**{'row': 6, 'A': 1, 'B': 2, 'C': 3}) # (6) dict / json interpreted a kwargs\n>>> t.add_rows((7, 1, 2, 3), (8, 4, 5, 6)) # (7) two (or more) tuples as args\n>>> t.add_rows([9, 1, 2, 3], [10, 4, 5, 6]) # (8) two or more lists as rgs\n>>> t.add_rows(\n {'row': 11, 'A': 1, 'B': 2, 'C': 3},\n {'row': 12, 'A': 4, 'B': 5, 'C': 6}\n ) # (9) two (or more) dicts as args - roughly comma sep'd json.\n>>> t.add_rows( *[\n {'row': 13, 'A': 1, 'B': 2, 'C': 3},\n {'row': 14, 'A': 1, 'B': 2, 'C': 3}\n ]) # (10) list of dicts as args\n>>> t.add_rows(row=[15,16], A=[1,1], B=[2,2], C=[3,3]) # (11) kwargs with lists as values\n Source code in tablite/base.py def add_rows(self, *args, **kwargs):\n \"\"\"its more efficient to add many rows at once.\n\n if both args and kwargs, then args are added first, followed by kwargs.\n\n supported cases:\n ```\n >>> t = Table()\n >>> t.add_columns('row','A','B','C')\n >>> t.add_rows(1, 1, 2, 3) # (1) individual values as args\n >>> t.add_rows([2, 1, 2, 3]) # (2) list of values as args\n >>> t.add_rows((3, 1, 2, 3)) # (3) tuple of values as args\n >>> t.add_rows(*(4, 1, 2, 3)) # (4) unpacked tuple becomes arg like (1)\n >>> t.add_rows(row=5, A=1, B=2, C=3) # (5) kwargs\n >>> t.add_rows(**{'row': 6, 'A': 1, 'B': 2, 'C': 3}) # (6) dict / json interpreted a kwargs\n >>> t.add_rows((7, 1, 2, 3), (8, 4, 5, 6)) # (7) two (or more) tuples as args\n >>> t.add_rows([9, 1, 2, 3], [10, 4, 5, 6]) # (8) two or more lists as rgs\n >>> t.add_rows(\n {'row': 11, 'A': 1, 'B': 2, 'C': 3},\n {'row': 12, 'A': 4, 'B': 5, 'C': 6}\n ) # (9) two (or more) dicts as args - roughly comma sep'd json.\n >>> t.add_rows( *[\n {'row': 13, 'A': 1, 'B': 2, 'C': 3},\n {'row': 14, 'A': 1, 'B': 2, 'C': 3}\n ]) # (10) list of dicts as args\n >>> t.add_rows(row=[15,16], A=[1,1], B=[2,2], C=[3,3]) # (11) kwargs with lists as values\n ```\n\n \"\"\"\n if not BaseTable._add_row_slow_warning:\n warnings.warn(\n \"add_rows is slow. Consider using add_columns and then assigning values to the columns directly.\"\n )\n BaseTable._add_row_slow_warning = True\n\n if args:\n if not all(isinstance(i, (list, tuple, dict)) for i in args): # 1,4\n args = [args]\n\n if all(isinstance(i, (list, tuple, dict)) for i in args): # 2,3,7,8\n # 1. 
turn the data into columns:\n\n d = {n: [] for n in self.columns}\n for arg in args:\n if len(arg) != len(self.columns):\n raise ValueError(\n f\"len({arg})== {len(arg)}, but there are {len(self.columns)} columns\"\n )\n\n if isinstance(arg, dict):\n for k, v in arg.items(): # 7,8\n d[k].append(v)\n\n elif isinstance(arg, (list, tuple)): # 2,3\n for n, v in zip(self.columns, arg):\n d[n].append(v)\n\n else:\n raise TypeError(f\"{arg}?\")\n # 2. extend the columns\n for n, values in d.items():\n col = self.columns[n]\n col.extend(list_to_np_array(values))\n\n if kwargs:\n if isinstance(kwargs, dict):\n if all(isinstance(v, (list, tuple)) for v in kwargs.values()):\n for k, v in kwargs.items():\n col = self.columns[k]\n col.extend(list_to_np_array(v))\n else:\n for k, v in kwargs.items():\n col = self.columns[k]\n col.extend(np.array([v]))\n else:\n raise ValueError(f\"format not recognised: {kwargs}\")\n\n return\n "},{"location":"reference/base/#tablite.base.BaseTable.add_columns","title":"tablite.base.BaseTable.add_columns(*names) ","text":"Adds column names to table. Source code in tablite/base.py def add_columns(self, *names):\n \"\"\"Adds column names to table.\"\"\"\n for name in names:\n self.columns[name] = Column(self.path)\n "},{"location":"reference/base/#tablite.base.BaseTable.add_column","title":"tablite.base.BaseTable.add_column(name, data=None) ","text":"verbose alias for table[name] = data, that checks if name already exists PARAMETER DESCRIPTION name column name TYPE: str data values. Defaults to None. TYPE: list,tuple) DEFAULT: None RAISES DESCRIPTION TypeError name isn't string ValueError name already exists Source code in tablite/base.py def add_column(self, name, data=None):\n \"\"\"verbose alias for table[name] = data, that checks if name already exists\n\n Args:\n name (str): column name\n data ((list,tuple), optional): values. Defaults to None.\n\n Raises:\n TypeError: name isn't string\n ValueError: name already exists\n \"\"\"\n if not isinstance(name, str):\n raise TypeError(\"expected name as string\")\n if name in self.columns:\n raise ValueError(f\"{name} already in {self.columns}\")\n self.__setitem__(name, data)\n "},{"location":"reference/base/#tablite.base.BaseTable.stack","title":"tablite.base.BaseTable.stack(other) ","text":"returns the joint stack of tables with overlapping column names. Example: | Table A| + | Table B| = | Table AB |\n| A| B| C| | A| B| D| | A| B| C| -|\n | A| B| -| D|\n Source code in tablite/base.py def stack(self, other):\n \"\"\"\n returns the joint stack of tables with overlapping column names.\n Example:\n ```\n | Table A| + | Table B| = | Table AB |\n | A| B| C| | A| B| D| | A| B| C| -|\n | A| B| -| D|\n ```\n \"\"\"\n if not isinstance(other, BaseTable):\n raise TypeError(f\"stack only works for Table, not {type(other)}\")\n\n cp = self.copy()\n for name, col2 in other.columns.items():\n if name not in cp.columns:\n cp[name] = [None] * len(self)\n cp[name].pages.extend(col2.pages[:])\n\n for name in self.columns:\n if name not in other.columns:\n if len(cp) > 0:\n cp[name].extend(np.array([None] * len(other)))\n return cp\n "},{"location":"reference/base/#tablite.base.BaseTable.types","title":"tablite.base.BaseTable.types() ","text":"returns nested dict of data types in the form: {column name: {python type class: number of instances }, ... 
} example: >>> t.types()\n{\n 'A': {<class 'str'>: 7},\n 'B': {<class 'int'>: 7}\n}\n Source code in tablite/base.py def types(self):\n \"\"\"\n returns nested dict of data types in the form:\n `{column name: {python type class: number of instances }, ... }`\n\n example:\n ```\n >>> t.types()\n {\n 'A': {<class 'str'>: 7},\n 'B': {<class 'int'>: 7}\n }\n ```\n \"\"\"\n d = {}\n for name, col in self.columns.items():\n assert isinstance(col, Column)\n d[name] = col.types()\n return d\n "},{"location":"reference/base/#tablite.base.BaseTable.display_dict","title":"tablite.base.BaseTable.display_dict(slice_=None, blanks=None, dtype=False) ","text":"helper for creating dict for display. PARAMETER DESCRIPTION slice_ python slice. Defaults to None. TYPE: slice DEFAULT: None blanks fill value for None . Defaults to None. TYPE: optional DEFAULT: None dtype Adds datatype to each column. Defaults to False. TYPE: bool DEFAULT: False RAISES DESCRIPTION TypeError slice_ must be None or slice. RETURNS DESCRIPTION dict from Table. Source code in tablite/base.py def display_dict(self, slice_=None, blanks=None, dtype=False):\n \"\"\"helper for creating dict for display.\n\n Args:\n slice_ (slice, optional): python slice. Defaults to None.\n blanks (optional): fill value for `None`. Defaults to None.\n dtype (bool, optional): Adds datatype to each column. Defaults to False.\n\n Raises:\n TypeError: slice_ must be None or slice.\n\n Returns:\n dict: from Table.\n \"\"\"\n if not self.columns:\n print(\"Empty Table\")\n return\n\n def datatype(col): # PRIVATE\n \"\"\"creates label for column datatype.\"\"\"\n types = col.types()\n if len(types) == 0:\n typ = \"empty\"\n elif len(types) == 1:\n dt, _ = types.popitem()\n typ = dt.__name__\n else:\n typ = \"mixed\"\n return typ\n\n row_count_tags = [\"#\", \"~\", \"*\"]\n cols = set(self.columns)\n for n, tag in product(range(1, 6), row_count_tags):\n if n * tag not in cols:\n tag = n * tag\n break\n\n if not isinstance(slice_, (slice, type(None))):\n raise TypeError(f\"slice_ must be None or slice, not {type(slice_)}\")\n if isinstance(slice_, slice):\n slc = slice_\n if slice_ is None:\n if len(self) <= 20:\n slc = slice(0, 20, 1)\n else:\n slc = None\n\n n = len(self)\n if slc: # either we want slc or we want everything.\n row_no = list(range(*slc.indices(len(self))))\n data = {tag: [f\"{i:,}\".rjust(2) for i in row_no]}\n for name, col in self.columns.items():\n data[name] = list(chain(iter(col), repeat(blanks, times=n - len(col))))[\n slc\n ]\n else:\n data = {}\n j = int(math.ceil(math.log10(n)) / 3) + len(str(n))\n row_no = (\n [f\"{i:,}\".rjust(j) for i in range(7)]\n + [\"...\"]\n + [f\"{i:,}\".rjust(j) for i in range(n - 7, n)]\n )\n data = {tag: row_no}\n\n for name, col in self.columns.items():\n if len(col) == n:\n row = col[:7].tolist() + [\"...\"] + col[-7:].tolist()\n else:\n empty = [blanks] * 7\n head = (col[:7].tolist() + empty)[:7]\n tail = (col[n - 7 :].tolist() + empty)[-7:]\n row = head + [\"...\"] + tail\n data[name] = row\n\n if dtype:\n for name, values in data.items():\n if name in self.columns:\n col = self.columns[name]\n values.insert(0, datatype(col))\n else:\n values.insert(0, \"row\")\n\n return data\n "},{"location":"reference/base/#tablite.base.BaseTable.to_ascii","title":"tablite.base.BaseTable.to_ascii(slice_=None, blanks=None, dtype=False) ","text":"returns ascii view of table as string. PARAMETER DESCRIPTION slice_ slice to determine table snippet. TYPE: slice DEFAULT: None blanks value for whitespace. Defaults to None. 
TYPE: str DEFAULT: None dtype adds subheader with datatype for column. Defaults to False. TYPE: bool DEFAULT: False Source code in tablite/base.py def to_ascii(self, slice_=None, blanks=None, dtype=False):\n \"\"\"returns ascii view of table as string.\n\n Args:\n slice_ (slice, optional): slice to determine table snippet.\n blanks (str, optional): value for whitespace. Defaults to None.\n dtype (bool, optional): adds subheader with datatype for column. Defaults to False.\n \"\"\"\n\n def adjust(v, length): # PRIVATE FUNCTION\n \"\"\"whitespace justifies field values based on datatype\"\"\"\n if v is None:\n return str(blanks).ljust(length)\n elif isinstance(v, str):\n return v.ljust(length)\n else:\n return str(v).rjust(length)\n\n if not self.columns:\n return str(self)\n\n d = {}\n for name, values in self.display_dict(\n slice_=slice_, blanks=blanks, dtype=dtype\n ).items():\n as_text = [str(v) for v in values] + [str(name)]\n width = max(len(i) for i in as_text)\n new_name = name.center(width, \" \")\n if dtype:\n values[0] = values[0].center(width, \" \")\n d[new_name] = [adjust(v, width) for v in values]\n\n rows = dict_to_rows(d)\n s = []\n s.append(\"+\" + \"+\".join([\"=\" * len(n) for n in rows[0]]) + \"+\")\n s.append(\"|\" + \"|\".join(rows[0]) + \"|\") # column names\n start = 1\n if dtype:\n s.append(\"|\" + \"|\".join(rows[1]) + \"|\") # datatypes\n start = 2\n\n s.append(\"+\" + \"+\".join([\"-\" * len(n) for n in rows[0]]) + \"+\")\n for row in rows[start:]:\n s.append(\"|\" + \"|\".join(row) + \"|\")\n s.append(\"+\" + \"+\".join([\"=\" * len(n) for n in rows[0]]) + \"+\")\n\n if len(set(len(c) for c in self.columns.values())) != 1:\n warning = f\"Warning: Columns have different lengths. {blanks} is used as fill value.\"\n s.append(warning)\n\n return \"\\n\".join(s)\n "},{"location":"reference/base/#tablite.base.BaseTable.show","title":"tablite.base.BaseTable.show(slice_=None, blanks=None, dtype=False) ","text":"prints ascii view of table. PARAMETER DESCRIPTION slice_ slice to determine table snippet. TYPE: slice DEFAULT: None blanks value for whitespace. Defaults to None. TYPE: str DEFAULT: None dtype adds subheader with datatype for column. Defaults to False. TYPE: bool DEFAULT: False Source code in tablite/base.py def show(self, slice_=None, blanks=None, dtype=False):\n \"\"\"prints ascii view of table.\n\n Args:\n slice_ (slice, optional): slice to determine table snippet.\n blanks (str, optional): value for whitespace. Defaults to None.\n dtype (bool, optional): adds subheader with datatype for column. Defaults to False.\n \"\"\"\n print(self.to_ascii(slice_=slice_, blanks=blanks, dtype=dtype))\n "},{"location":"reference/base/#tablite.base.BaseTable.to_dict","title":"tablite.base.BaseTable.to_dict(columns=None, slice_=None) ","text":"columns: list of column names. Default is None == all columns. slice_: slice. Default is None == all rows. returns: dict with columns as keys and lists of values. Example: >>> t.show()\n+===+===+===+\n| # | a | b |\n|row|int|int|\n+---+---+---+\n| 0 | 1| 3|\n| 1 | 2| 4|\n+===+===+===+\n>>> t.to_dict()\n{'a':[1,2], 'b':[3,4]}\n Source code in tablite/base.py def to_dict(self, columns=None, slice_=None):\n \"\"\"\n columns: list of column names. Default is None == all columns.\n slice_: slice. 
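A sketch of the display helpers in use, with invented data:
```
from tablite import Table

t = Table(columns={"a": [1, 2, 3], "b": ["x", "y", None]})
t.show()                                # prints the whole table
t.show(slice_=slice(0, 2), dtype=True)  # first two rows plus a datatype sub-header
txt = t.to_ascii(blanks="-")            # the same view as a string; None shown as "-"
```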
Default is None == all rows.\n\n returns: dict with columns as keys and lists of values.\n\n Example:\n ```\n >>> t.show()\n +===+===+===+\n | # | a | b |\n |row|int|int|\n +---+---+---+\n | 0 | 1| 3|\n | 1 | 2| 4|\n +===+===+===+\n >>> t.to_dict()\n {'a':[1,2], 'b':[3,4]}\n ```\n\n \"\"\"\n if slice_ is None:\n slice_ = slice(0, len(self))\n assert isinstance(slice_, slice)\n\n if columns is None:\n columns = list(self.columns.keys())\n if not isinstance(columns, list):\n raise TypeError(\"expected columns as list of strings\")\n\n return {name: list(self.columns[name][slice_]) for name in columns}\n "},{"location":"reference/base/#tablite.base.BaseTable.as_json_serializable","title":"tablite.base.BaseTable.as_json_serializable(row_count='row id', start_on=1, columns=None, slice_=None) ","text":"provides a JSON compatible format of the table. PARAMETER DESCRIPTION row_count Label for row counts. Defaults to \"row id\". TYPE: str DEFAULT: 'row id' start_on row counts starts by default on 1. TYPE: int DEFAULT: 1 columns Column names. Defaults to None which returns all columns. TYPE: list of str DEFAULT: None slice_ selector. Defaults to None which returns [:] TYPE: slice DEFAULT: None RETURNS DESCRIPTION JSON serializable dict: All python datatypes have been converted to JSON compliant data. Source code in tablite/base.py def as_json_serializable(\n self, row_count=\"row id\", start_on=1, columns=None, slice_=None\n):\n \"\"\"provides a JSON compatible format of the table.\n\n Args:\n row_count (str, optional): Label for row counts. Defaults to \"row id\".\n start_on (int, optional): row counts starts by default on 1.\n columns (list of str, optional): Column names.\n Defaults to None which returns all columns.\n slice_ (slice, optional): selector. Defaults to None which returns [:]\n\n Returns:\n JSON serializable dict: All python datatypes have been converted to JSON compliant data.\n \"\"\"\n if slice_ is None:\n slice_ = slice(0, len(self))\n\n assert isinstance(slice_, slice)\n new = {\"columns\": {}, \"total_rows\": len(self)}\n if row_count is not None:\n new[\"columns\"][row_count] = [\n i + start_on for i in range(*slice_.indices(len(self)))\n ]\n\n d = self.to_dict(columns, slice_=slice_)\n for k, data in d.items():\n new_k = unique_name(\n k, new[\"columns\"]\n ) # used to avoid overwriting the `row id` key.\n new[\"columns\"][new_k] = [\n DataTypes.to_json(v) for v in data\n ] # deal with non-json datatypes.\n return new\n "},{"location":"reference/base/#tablite.base.BaseTable.index","title":"tablite.base.BaseTable.index(*args) ","text":"param: *args: column names returns multikey index on the columns as d[(key tuple, )] = {index1, index2, ...} Examples: >>> table6 = Table()\n>>> table6['A'] = ['Alice', 'Bob', 'Bob', 'Ben', 'Charlie', 'Ben','Albert']\n>>> table6['B'] = ['Alison', 'Marley', 'Dylan', 'Affleck', 'Hepburn', 'Barnes', 'Einstein']\n >>> table6.index('A') # single key.\n{('Alice',): [0],\n ('Bob',): [1, 2],\n ('Ben',): [3, 5],\n ('Charlie',): [4],\n ('Albert',): [6]})\n >>> table6.index('A', 'B') # multiple keys.\n{('Alice', 'Alison'): [0],\n ('Bob', 'Marley'): [1],\n ('Bob', 'Dylan'): [2],\n ('Ben', 'Affleck'): [3],\n ('Charlie', 'Hepburn'): [4],\n ('Ben', 'Barnes'): [5],\n ('Albert', 'Einstein'): [6]})\n Source code in tablite/base.py def index(self, *args):\n \"\"\"\n param: *args: column names\n returns multikey index on the columns as d[(key tuple, )] = {index1, index2, ...}\n\n Examples:\n ```\n >>> table6 = Table()\n >>> table6['A'] = ['Alice', 'Bob', 'Bob', 
'Ben', 'Charlie', 'Ben','Albert']\n >>> table6['B'] = ['Alison', 'Marley', 'Dylan', 'Affleck', 'Hepburn', 'Barnes', 'Einstein']\n ```\n\n ```\n >>> table6.index('A') # single key.\n {('Alice',): [0],\n ('Bob',): [1, 2],\n ('Ben',): [3, 5],\n ('Charlie',): [4],\n ('Albert',): [6]})\n ```\n\n ```\n >>> table6.index('A', 'B') # multiple keys.\n {('Alice', 'Alison'): [0],\n ('Bob', 'Marley'): [1],\n ('Bob', 'Dylan'): [2],\n ('Ben', 'Affleck'): [3],\n ('Charlie', 'Hepburn'): [4],\n ('Ben', 'Barnes'): [5],\n ('Albert', 'Einstein'): [6]})\n ```\n\n \"\"\"\n idx = defaultdict(list)\n iterators = [iter(self.columns[c]) for c in args]\n for ix, key in enumerate(zip(*iterators)):\n key = tuple(numpy_to_python(k) for k in key)\n idx[key].append(ix)\n return idx\n "},{"location":"reference/base/#tablite.base.BaseTable.unique_index","title":"tablite.base.BaseTable.unique_index(*args, tqdm=_tqdm) ","text":"generates the index of unique rows given a list of column names PARAMETER DESCRIPTION *args columns names TYPE: any DEFAULT: () tqdm Defaults to _tqdm. TYPE: tqdm DEFAULT: tqdm RETURNS DESCRIPTION np.array(int64): indices of unique records. Source code in tablite/base.py def unique_index(self, *args, tqdm=_tqdm):\n \"\"\"generates the index of unique rows given a list of column names\n\n Args:\n *args (any): columns names\n tqdm (tqdm, optional): Defaults to _tqdm.\n\n Returns:\n np.array(int64): indices of unique records.\n \"\"\"\n if not args:\n raise ValueError(\"*args (column names) is required\")\n seen = set()\n unique = set()\n iterators = [iter(self.columns[c]) for c in args]\n for ix, key in tqdm(enumerate(zip(*iterators)), disable=Config.TQDM_DISABLE):\n key_hash = hash(tuple(numpy_to_python(k) for k in key))\n if key_hash in seen:\n continue\n else:\n seen.add(key_hash)\n unique.add(ix)\n return np.array(sorted(unique))\n "},{"location":"reference/base/#tablite.base-functions","title":"Functions","text":""},{"location":"reference/base/#tablite.base.register","title":"tablite.base.register(path) ","text":"registers path in file_registry The method is used by Table during init when the working directory path is set, so that python can clean all temporary files up at exit. PARAMETER DESCRIPTION path typically tmp/tablite-tmp/PID-{os.getpid()} TYPE: Path Source code in tablite/base.py def register(path):\n \"\"\"registers path in file_registry\n\n The method is used by Table during init when the working directory path\n is set, so that python can clean all temporary files up at exit.\n\n Args:\n path (Path): typically tmp/tablite-tmp/PID-{os.getpid()}\n \"\"\"\n global file_registry\n file_registry.add(path)\n "},{"location":"reference/base/#tablite.base.shutdown","title":"tablite.base.shutdown() ","text":"method to clean up temporary files triggered at shutdown. Source code in tablite/base.py def shutdown():\n \"\"\"method to clean up temporary files triggered at shutdown.\"\"\"\n for path in file_registry:\n if Config.pid in str(path): # safety feature to prevent rm -rf /\n log.debug(f\"shutdown: running rmtree({path})\")\n shutil.rmtree(path)\n "},{"location":"reference/config/","title":"Config","text":""},{"location":"reference/config/#tablite.config","title":"tablite.config ","text":""},{"location":"reference/config/#tablite.config-classes","title":"Classes","text":""},{"location":"reference/config/#tablite.config.Config","title":"tablite.config.Config ","text":" Bases: object Config class for Tablite Tables. 
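Returning to index and unique_index above, a minimal sketch with invented data; results are shown as comments:
```
from tablite import Table

t = Table(columns={"A": ["Bob", "Bob", "Ben"], "B": [1, 2, 3]})

t.index("A")         # {('Bob',): [0, 1], ('Ben',): [2]}
t.index("A", "B")    # {('Bob', 1): [0], ('Bob', 2): [1], ('Ben', 3): [2]}
t.unique_index("A")  # array([0, 2]) -- first occurrence of each key
```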
The default location for the storage is loaded as Config.workdir = pathlib.Path(os.environ.get(\"TABLITE_TMPDIR\", f\"{tempfile.gettempdir()}/tablite-tmp\"))\n to overwrite, first import the config class, then set the new workdir. >>> from tablite import config\n>>> from pathlib import Path\n>>> config.workdir = Path(\"/this/new/location\")\n the new path will now be used for every new table. PAGE_SIZE = 1_000_000 sets the page size limit. Multiprocessing is enabled in one of three modes: AUTO = \"auto\" FALSE = \"sp\" FORCE = \"mp\" MULTIPROCESSING_MODE = AUTO is default. SINGLE_PROCESSING_LIMIT = 1_000_000 when the number of fields (rows x columns) exceed this value, multiprocessing is used. "},{"location":"reference/config/#tablite.config.Config-attributes","title":"Attributes","text":""},{"location":"reference/config/#tablite.config.Config.USE_NIMPORTER","title":"tablite.config.Config.USE_NIMPORTER = os.environ.get('USE_NIMPORTER', 'true').lower() in ['1', 't', 'true', 'y', 'yes'] class-attribute instance-attribute ","text":""},{"location":"reference/config/#tablite.config.Config.ALLOW_CSV_READER_FALLTHROUGH","title":"tablite.config.Config.ALLOW_CSV_READER_FALLTHROUGH = os.environ.get('ALLOW_CSV_READER_FALLTHROUGH', 'true').lower() in ['1', 't', 'true', 'y', 'yes'] class-attribute instance-attribute ","text":""},{"location":"reference/config/#tablite.config.Config.NIM_SUPPORTED_CONV_TYPES","title":"tablite.config.Config.NIM_SUPPORTED_CONV_TYPES = ['Windows-1252', 'ISO-8859-1'] class-attribute instance-attribute ","text":""},{"location":"reference/config/#tablite.config.Config.workdir","title":"tablite.config.Config.workdir = pathlib.Path(os.environ.get('TABLITE_TMPDIR', f'{tempfile.gettempdir()}/tablite-tmp')) class-attribute instance-attribute ","text":""},{"location":"reference/config/#tablite.config.Config.pid","title":"tablite.config.Config.pid = f'pid-{os.getpid()}' class-attribute instance-attribute ","text":""},{"location":"reference/config/#tablite.config.Config.PAGE_SIZE","title":"tablite.config.Config.PAGE_SIZE = 1000000 class-attribute instance-attribute ","text":""},{"location":"reference/config/#tablite.config.Config.ENCODING","title":"tablite.config.Config.ENCODING = 'UTF-8' class-attribute instance-attribute ","text":""},{"location":"reference/config/#tablite.config.Config.DISK_LIMIT","title":"tablite.config.Config.DISK_LIMIT = int(10000000000.0) class-attribute instance-attribute ","text":"10e9 (10Gb) on 100 Gb disk means raise at 90 Gb disk usage. if DISK_LIMIT <= 0, the check is turned off. "},{"location":"reference/config/#tablite.config.Config.SINGLE_PROCESSING_LIMIT","title":"tablite.config.Config.SINGLE_PROCESSING_LIMIT = 1000000 class-attribute instance-attribute ","text":"when the number of fields (rows x columns) exceed this value, multiprocessing is used. 
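A sketch of tuning these settings; reset and page_steps are documented below, and the workdir location is invented:
```
from pathlib import Path
from tablite.config import Config

Config.workdir = Path("/mnt/fast-ssd/tablite-tmp")   # invented location
Config.PAGE_SIZE = 250_000                  # smaller pages, lower peak memory
Config.MULTIPROCESSING_MODE = Config.FORCE  # "mp": always use multiprocessing

list(Config.page_steps(600_000))
# [(0, 250000), (250000, 500000), (500000, 600000)]

Config.reset()   # restore the defaults when done
```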
"},{"location":"reference/config/#tablite.config.Config.vpus","title":"tablite.config.Config.vpus = max(os.cpu_count() - 1, 1) class-attribute instance-attribute ","text":""},{"location":"reference/config/#tablite.config.Config.AUTO","title":"tablite.config.Config.AUTO = 'auto' class-attribute instance-attribute ","text":""},{"location":"reference/config/#tablite.config.Config.FALSE","title":"tablite.config.Config.FALSE = 'sp' class-attribute instance-attribute ","text":""},{"location":"reference/config/#tablite.config.Config.FORCE","title":"tablite.config.Config.FORCE = 'mp' class-attribute instance-attribute ","text":""},{"location":"reference/config/#tablite.config.Config.MULTIPROCESSING_MODE","title":"tablite.config.Config.MULTIPROCESSING_MODE = AUTO class-attribute instance-attribute ","text":""},{"location":"reference/config/#tablite.config.Config.TQDM_DISABLE","title":"tablite.config.Config.TQDM_DISABLE = False class-attribute instance-attribute ","text":""},{"location":"reference/config/#tablite.config.Config-functions","title":"Functions","text":""},{"location":"reference/config/#tablite.config.Config.reset","title":"tablite.config.Config.reset() classmethod ","text":"Resets the config class to original values. Source code in tablite/config.py @classmethod\ndef reset(cls):\n \"\"\"Resets the config class to original values.\"\"\"\n for k, v in _default_values.items():\n setattr(Config, k, v)\n "},{"location":"reference/config/#tablite.config.Config.page_steps","title":"tablite.config.Config.page_steps(length) classmethod ","text":"an iterator that yield start and end in page sizes YIELDS DESCRIPTION tuple start:int, end:int Source code in tablite/config.py @classmethod\ndef page_steps(cls, length):\n \"\"\"an iterator that yield start and end in page sizes\n\n Yields:\n tuple: start:int, end:int\n \"\"\"\n start, end = 0, 0\n for _ in range(0, length + 1, cls.PAGE_SIZE):\n start, end = end, min(end + cls.PAGE_SIZE, length)\n yield start, end\n if end == length:\n return\n "},{"location":"reference/core/","title":"Core","text":""},{"location":"reference/core/#tablite.core","title":"tablite.core ","text":""},{"location":"reference/core/#tablite.core-attributes","title":"Attributes","text":""},{"location":"reference/core/#tablite.core.log","title":"tablite.core.log = logging.getLogger(__name__) module-attribute ","text":""},{"location":"reference/core/#tablite.core-classes","title":"Classes","text":""},{"location":"reference/core/#tablite.core.Table","title":"tablite.core.Table(columns=None, headers=None, rows=None, _path=None) ","text":" Bases: BaseTable creates Table PARAMETER DESCRIPTION EITHER columns (dict, optional): dict with column names as keys, values as lists. 
Example: t = Table(columns={\"a\": [1, 2], \"b\": [3, 4]}) Source code in tablite/core.py def __init__(self, columns=None, headers=None, rows=None, _path=None) -> None:\n \"\"\"creates Table\n\n Args:\n EITHER:\n columns (dict, optional): dict with column names as keys, values as lists.\n Example: t = Table(columns={\"a\": [1, 2], \"b\": [3, 4]})\n OR\n headers (list of strings, optional): list of column names.\n rows (list of tuples or lists, optional): values for columns\n Example: t = Table(headers=[\"a\", \"b\"], rows=[[1,3], [2,4]])\n \"\"\"\n super().__init__(columns, headers, rows, _path)\n "},{"location":"reference/core/#tablite.core.Table-attributes","title":"Attributes","text":""},{"location":"reference/core/#tablite.core.Table.path","title":"tablite.core.Table.path = _path instance-attribute ","text":""},{"location":"reference/core/#tablite.core.Table.columns","title":"tablite.core.Table.columns = {} instance-attribute ","text":""},{"location":"reference/core/#tablite.core.Table.rows","title":"tablite.core.Table.rows property ","text":"enables row based iteration in python types. Example: for row in Table.rows:\n print(row)\n Yields: tuple: values is same order as columns. "},{"location":"reference/core/#tablite.core.Table-functions","title":"Functions","text":""},{"location":"reference/core/#tablite.core.Table.__str__","title":"tablite.core.Table.__str__() ","text":"Source code in tablite/base.py def __str__(self): # USER FUNCTION.\n return f\"{self.__class__.__name__}({len(self.columns):,} columns, {len(self):,} rows)\"\n "},{"location":"reference/core/#tablite.core.Table.__repr__","title":"tablite.core.Table.__repr__() ","text":"Source code in tablite/base.py def __repr__(self):\n return self.__str__()\n "},{"location":"reference/core/#tablite.core.Table.nbytes","title":"tablite.core.Table.nbytes() ","text":"finds the total bytes of the table on disk RETURNS DESCRIPTION tuple int: real bytes used on disk int: total bytes used if flattened Source code in tablite/base.py def nbytes(self): # USER FUNCTION.\n \"\"\"finds the total bytes of the table on disk\n\n Returns:\n tuple:\n int: real bytes used on disk\n int: total bytes used if flattened\n \"\"\"\n real = {}\n total = 0\n for column in self.columns.values():\n for page in set(column.pages):\n real[page] = page.path.stat().st_size\n for page in column.pages:\n total += real[page]\n return sum(real.values()), total\n "},{"location":"reference/core/#tablite.core.Table.items","title":"tablite.core.Table.items() ","text":"returns table as dict RETURNS DESCRIPTION dict Table as dict {column_name: [values], ...} Source code in tablite/base.py def items(self): # USER FUNCTION.\n \"\"\"returns table as dict\n\n Returns:\n dict: Table as dict `{column_name: [values], ...}`\n \"\"\"\n return {\n name: column[:].tolist() for name, column in self.columns.items()\n }.items()\n "},{"location":"reference/core/#tablite.core.Table.__delitem__","title":"tablite.core.Table.__delitem__(key) ","text":"Examples: >>> del table['a'] # removes column 'a'\n>>> del table[-3:] # removes last 3 rows from all columns.\n Source code in tablite/base.py def __delitem__(self, key): # USER FUNCTION.\n \"\"\"\n Examples:\n ```\n >>> del table['a'] # removes column 'a'\n >>> del table[-3:] # removes last 3 rows from all columns.\n ```\n \"\"\"\n if isinstance(key, (int, slice)):\n for column in self.columns.values():\n del column[key]\n elif key in self.columns:\n del self.columns[key]\n else:\n raise KeyError(f\"Key not found: {key}\")\n 
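Pulling the constructor and the dict-like access together; the selection patterns sketched here are documented in detail below:
```
from tablite import Table

t = Table(columns={"a": [1, 2, 3], "b": [4, 5, 6]})
# equivalent: Table(headers=["a", "b"], rows=[[1, 4], [2, 5], [3, 6]])

for row in t.rows:          # row-wise iteration in python types
    print(row)              # (1, 4), then (2, 5), then (3, 6)

t["c"] = [7, 8, 9]          # add a column, dict-style
t[1]                        # (2, 5, 8) -- row 1 as a tuple
t[:2]                       # new Table with the first two rows
t["a", "c", slice(0, 2)]    # columns 'a' and 'c', rows 0-1
del t[-1:]                  # drop the last row from every column
```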
"},{"location":"reference/core/#tablite.core.Table.__setitem__","title":"tablite.core.Table.__setitem__(key, value) ","text":"table behaves like a dict. Args: key (str or hashable): column name value (iterable): list, tuple or nd.array with values. As Table now accepts the keyword columns as a dict: >>> t = Table(columns={'b':[4,5,6], 'c':[7,8,9]})\n and the header/data combinations: >>> t = Table(header=['b','c'], data=[[4,5,6],[7,8,9]])\n This has the side-benefit that tuples now can be used as headers. Source code in tablite/base.py def __setitem__(self, key, value): # USER FUNCTION\n \"\"\"table behaves like a dict.\n Args:\n key (str or hashable): column name\n value (iterable): list, tuple or nd.array with values.\n\n As Table now accepts the keyword `columns` as a dict:\n ```\n >>> t = Table(columns={'b':[4,5,6], 'c':[7,8,9]})\n ```\n and the header/data combinations:\n ```\n >>> t = Table(header=['b','c'], data=[[4,5,6],[7,8,9]])\n ```\n This has the side-benefit that tuples now can be used as headers.\n \"\"\"\n if value is None:\n self.columns[key] = Column(self.path, value=None)\n elif isinstance(value, (list, tuple)):\n value = list_to_np_array(value)\n self.columns[key] = Column(self.path, value)\n elif isinstance(value, (np.ndarray)):\n self.columns[key] = Column(self.path, value)\n elif isinstance(value, Column):\n self.columns[key] = value\n else:\n raise TypeError(f\"{type(value)} not supported.\")\n "},{"location":"reference/core/#tablite.core.Table.__getitem__","title":"tablite.core.Table.__getitem__(keys) ","text":"Enables selection of columns and rows PARAMETER DESCRIPTION keys TYPE: column name, integer or slice Examples >>> 10] selects first 10 rows from all columns TYPE: table[ >>> 20:3] selects column 'b' and 'c' and 'a' twice for a slice. TYPE: table['b', 'a', 'a', 'c', 2 Raises: KeyError: if key is not found. TypeError: if key is not a string, integer or slice. RETURNS DESCRIPTION Table returns columns in same order as selection. Source code in tablite/base.py def __getitem__(self, keys): # USER FUNCTION\n \"\"\"\n Enables selection of columns and rows\n\n Args:\n keys (column name, integer or slice):\n Examples:\n ```\n >>> table['a'] selects column 'a'\n >>> table[3] selects row 3 as a tuple.\n >>> table[:10] selects first 10 rows from all columns\n >>> table['a','b', slice(3,20,2)] selects a slice from columns 'a' and 'b'\n >>> table['b', 'a', 'a', 'c', 2:20:3] selects column 'b' and 'c' and 'a' twice for a slice.\n >>> table[('b', 'a', 'a', 'c')] selects columns 'b', 'a', 'a', and 'c' using a tuple.\n ```\n Raises:\n KeyError: if key is not found.\n TypeError: if key is not a string, integer or slice.\n\n Returns:\n Table: returns columns in same order as selection.\n \"\"\"\n\n if not isinstance(keys, tuple):\n if isinstance(keys, list):\n keys = tuple(keys)\n else:\n keys = (keys,)\n if isinstance(keys[0], tuple):\n keys = tuple(list(chain(*keys)))\n\n integers = [i for i in keys if isinstance(i, int)]\n if len(integers) == len(keys) == 1: # return a single tuple.\n keys = [slice(keys[0])]\n\n column_names = [i for i in keys if isinstance(i, str)]\n column_names = list(self.columns) if not column_names else column_names\n not_found = [name for name in column_names if name not in self.columns]\n if not_found:\n raise KeyError(f\"keys not found: {', '.join(not_found)}\")\n\n slices = [i for i in keys if isinstance(i, slice)]\n slc = slice(0, len(self)) if not slices else slices[0]\n\n if (\n len(slices) == 0 and len(column_names) == 1\n ): # e.g. 
tbl['a'] or tbl['a'][:10]\n col = self.columns[column_names[0]]\n if slices:\n return col[slc] # return slice from column as list of values\n else:\n return col # return whole column\n\n elif len(integers) == 1: # return a single tuple.\n row_no = integers[0]\n slc = slice(row_no, row_no + 1)\n return tuple(self.columns[name][slc].tolist()[0] for name in column_names)\n\n elif not slices: # e.g. new table with N whole columns.\n return self.__class__(\n columns={name: self.columns[name] for name in column_names}\n )\n\n else: # e.g. new table from selection of columns and slices.\n t = self.__class__()\n for name in column_names:\n column = self.columns[name]\n\n new_column = Column(t.path) # create new Column.\n for item in column.getpages(slc):\n if isinstance(item, np.ndarray):\n new_column.extend(item) # extend subslice (expensive)\n elif isinstance(item, SimplePage):\n new_column.pages.append(item) # extend page (cheap)\n else:\n raise TypeError(f\"Bad item: {item}\")\n\n # below:\n # set the new column directly on t.columns.\n # Do not use t[name] as that triggers __setitem__ again.\n t.columns[name] = new_column\n\n return t\n "},{"location":"reference/core/#tablite.core.Table.__len__","title":"tablite.core.Table.__len__() ","text":"Source code in tablite/base.py def __len__(self): # USER FUNCTION.\n if not self.columns:\n return 0\n return max(len(c) for c in self.columns.values())\n "},{"location":"reference/core/#tablite.core.Table.__eq__","title":"tablite.core.Table.__eq__(other) -> bool ","text":"Determines if two tables have identical content. PARAMETER DESCRIPTION other table for comparison TYPE: Table RETURNS DESCRIPTION bool True if tables are identical. TYPE: bool Source code in tablite/base.py def __eq__(self, other) -> bool: # USER FUNCTION.\n \"\"\"Determines if two tables have identical content.\n\n Args:\n other (Table): table for comparison\n\n Returns:\n bool: True if tables are identical.\n \"\"\"\n if isinstance(other, dict):\n return self.items() == other.items()\n if not isinstance(other, BaseTable):\n return False\n if id(self) == id(other):\n return True\n if len(self) != len(other):\n return False\n if len(self) == len(other) == 0:\n return True\n if self.columns.keys() != other.columns.keys():\n return False\n for name, col in self.columns.items():\n if not (col == other.columns[name]):\n return False\n return True\n "},{"location":"reference/core/#tablite.core.Table.clear","title":"tablite.core.Table.clear() ","text":"clears the table. Like dict().clear() Source code in tablite/base.py def clear(self): # USER FUNCTION.\n \"\"\"clears the table. Like dict().clear()\"\"\"\n self.columns.clear()\n "},{"location":"reference/core/#tablite.core.Table.save","title":"tablite.core.Table.save(path, compression_method=zipfile.ZIP_DEFLATED, compression_level=1) ","text":"saves table to compressed tpz file. PARAMETER DESCRIPTION path file destination. TYPE: Path compression_method See zipfile compression methods. Defaults to ZIP_DEFLATED. DEFAULT: ZIP_DEFLATED compression_level See zipfile compression levels. Defaults to 1. DEFAULT: 1 The file format is as follows: .tpz is a gzip archive with table metadata captured as table.yml and the necessary set of pages saved as .npy files. 
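A sketch of equality and clear, per __eq__ above:
```
from tablite import Table

a = Table(columns={"x": [1, 2]})
b = Table(columns={"x": [1, 2]})
assert a == b                # content equality, not identity
assert a == {"x": [1, 2]}    # a dict with the same items also compares equal
b.clear()                    # drops all columns, like dict().clear()
assert len(b) == 0
```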
The zip contains table.yml which provides an overview of the data: --------------------------------------\n%YAML 1.2 yaml version\ncolumns: start of columns section.\n name: \u201c\u5217 1\u201d name of column 1.\n pages: [p1b1, p1b2] list of pages in column 1.\n name: \u201c\u5217 2\u201d name of column 2\n pages: [p2b1, p2b2] list of pages in column 2.\n----------------------------------------\n Source code in tablite/base.py def save(\n self, path, compression_method=zipfile.ZIP_DEFLATED, compression_level=1\n): # USER FUNCTION.\n \"\"\"saves table to compressed tpz file.\n\n Args:\n path (Path): file destination.\n compression_method: See zipfile compression methods. Defaults to ZIP_DEFLATED.\n compression_level: See zipfile compression levels. Defaults to 1.\n The default settings produce 80% compression at 10% slowdown.\n\n The file format is as follows:\n .tpz is a gzip archive with table metadata captured as table.yml\n and the necessary set of pages saved as .npy files.\n\n The zip contains table.yml which provides an overview of the data:\n ```\n --------------------------------------\n %YAML 1.2 yaml version\n columns: start of columns section.\n name: \u201c\u5217 1\u201d name of column 1.\n pages: [p1b1, p1b2] list of pages in column 1.\n name: \u201c\u5217 2\u201d name of column 2\n pages: [p2b1, p2b2] list of pages in column 2.\n ----------------------------------------\n ```\n \"\"\"\n if isinstance(path, str):\n path = Path(path)\n type_check(path, Path)\n if path.is_dir():\n raise TypeError(f\"filename needed: {path}\")\n if path.suffix != \".tpz\":\n path = path.parent / (path.parts[-1] + \".tpz\")\n\n # create yaml document\n _page_counter = 0\n d = {}\n cols = {}\n for name, col in self.columns.items():\n type_check(col, Column)\n cols[name] = {\"pages\": [p.path.name for p in col.pages]}\n _page_counter += len(col.pages)\n d[\"columns\"] = cols\n yml = yaml.safe_dump(\n d, sort_keys=False, allow_unicode=True, default_flow_style=None\n )\n\n _file_counter = 0\n with zipfile.ZipFile(\n path, \"w\", compression=compression_method, compresslevel=compression_level\n ) as f:\n log.debug(f\"writing .tpz to {path} with\\n{yml}\")\n f.writestr(\"table.yml\", yml)\n for name, col in self.columns.items():\n for page in set(\n col.pages\n ): # set of pages! remember t *= 1000 repeats t 1000x\n with open(page.path, \"rb\", buffering=0) as raw_io:\n f.writestr(page.path.name, raw_io.read())\n _file_counter += 1\n log.debug(f\"adding Page {page.path}\")\n\n _fields = len(self) * len(self.columns)\n _avg = _fields // _page_counter\n log.debug(\n f\"Wrote {_fields:,} on {_page_counter:,} pages in {_file_counter} files: {_avg} fields/page\"\n )\n "},{"location":"reference/core/#tablite.core.Table.load","title":"tablite.core.Table.load(path, tqdm=_tqdm) classmethod ","text":"loads a table from .tpz file. See also Table.save for details on the file format. PARAMETER DESCRIPTION path source file TYPE: Path RETURNS DESCRIPTION Table table in read-only mode. 
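A save/load round trip; the zipfile peek below relies only on the format described above, and the file name is invented:
```
import zipfile
from tablite import Table

t = Table(columns={"a": [1, 2], "b": [3, 4]})
t.save("demo.tpz")            # str paths are accepted; the ".tpz" suffix is enforced

t2 = Table.load("demo.tpz")
assert t == t2                # content equality

with zipfile.ZipFile("demo.tpz") as z:   # a .tpz is a plain zip archive
    print(z.read("table.yml").decode())  # column names and their page files
```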
Source code in tablite/base.py @classmethod\ndef load(cls, path, tqdm=_tqdm): # USER FUNCTION.\n \"\"\"loads a table from .tpz file.\n See also Table.save for details on the file format.\n\n Args:\n path (Path): source file\n\n Returns:\n Table: table in read-only mode.\n \"\"\"\n path = Path(path)\n log.debug(f\"loading {path}\")\n with zipfile.ZipFile(path, \"r\") as f:\n yml = f.read(\"table.yml\")\n metadata = yaml.safe_load(yml)\n t = cls()\n\n page_count = sum([len(c[\"pages\"]) for c in metadata[\"columns\"].values()])\n\n with tqdm(\n total=page_count,\n desc=f\"loading '{path.name}' file\",\n disable=Config.TQDM_DISABLE,\n ) as pbar:\n for name, d in metadata[\"columns\"].items():\n column = Column(t.path)\n for page in d[\"pages\"]:\n bytestream = io.BytesIO(f.read(page))\n data = np.load(bytestream, allow_pickle=True, fix_imports=False)\n column.extend(data)\n pbar.update(1)\n t.columns[name] = column\n update_access_time(path)\n return t\n "},{"location":"reference/core/#tablite.core.Table.copy","title":"tablite.core.Table.copy() ","text":"Source code in tablite/base.py def copy(self):\n cls = type(self)\n t = cls()\n for name, column in self.columns.items():\n new = Column(t.path)\n new.pages = column.pages[:]\n t.columns[name] = new\n return t\n "},{"location":"reference/core/#tablite.core.Table.__imul__","title":"tablite.core.Table.__imul__(other) ","text":"Repeats instance of table N times. Like list: t = t * N PARAMETER DESCRIPTION other multiplier TYPE: int Source code in tablite/base.py def __imul__(self, other):\n \"\"\"Repeats instance of table N times.\n\n Like list: `t = t * N`\n\n Args:\n other (int): multiplier\n \"\"\"\n if not (isinstance(other, int) and other > 0):\n raise TypeError(\n f\"a table can be repeated an integer number of times, not {type(other)} number of times\"\n )\n for col in self.columns.values():\n col *= other\n return self\n "},{"location":"reference/core/#tablite.core.Table.__mul__","title":"tablite.core.Table.__mul__(other) ","text":"Repeat table N times. Like list: new = old * N PARAMETER DESCRIPTION other multiplier TYPE: int RETURNS DESCRIPTION Table Source code in tablite/base.py def __mul__(self, other):\n \"\"\"Repeat table N times.\n Like list: `new = old * N`\n\n Args:\n other (int): multiplier\n\n Returns:\n Table\n \"\"\"\n new = self.copy()\n return new.__imul__(other)\n "},{"location":"reference/core/#tablite.core.Table.__iadd__","title":"tablite.core.Table.__iadd__(other) ","text":"Concatenates tables with same column names. Like list: table_1 += table_2 RAISES DESCRIPTION ValueError If column names don't match. RETURNS DESCRIPTION None self is updated. Source code in tablite/base.py def __iadd__(self, other):\n \"\"\"Concatenates tables with same column names.\n\n Like list: `table_1 += table_2`\n\n Args:\n other (Table)\n\n Raises:\n ValueError: If column names don't match.\n\n Returns:\n None: self is updated.\n \"\"\"\n type_check(other, BaseTable)\n for name in self.columns.keys():\n if name not in other.columns:\n raise ValueError(f\"{name} not in other\")\n for name in other.columns.keys():\n if name not in self.columns:\n raise ValueError(f\"{name} missing from self\")\n\n for name, column in self.columns.items():\n other_col = other.columns.get(name, None)\n column.pages.extend(other_col.pages[:])\n return self\n "},{"location":"reference/core/#tablite.core.Table.__add__","title":"tablite.core.Table.__add__(other) ","text":"Concatenates tables with same column names. 
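Before the details of concatenation below, a sketch of copy, repetition and + / +=:
```
from tablite import Table

t = Table(columns={"a": [1, 2]})
t2 = t.copy()      # cheap: pages are referenced, not duplicated

t2 *= 3            # in-place repetition: a == [1, 2, 1, 2, 1, 2]
t3 = t * 2         # new table; t is unchanged

t4 = t + t3        # concatenation; column names must match
t += t4            # in-place concatenation
```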
Like list: table_3 = table_1 + table_2 RAISES DESCRIPTION ValueError If column names don't match. RETURNS DESCRIPTION Table Source code in tablite/base.py def __add__(self, other):\n \"\"\"Concatenates tables with same column names.\n\n Like list: `table_3 = table_1 + table_2`\n\n Args:\n other (Table)\n\n Raises:\n ValueError: If column names don't match.\n\n Returns:\n Table\n \"\"\"\n type_check(other, BaseTable)\n cp = self.copy()\n cp += other\n return cp\n "},{"location":"reference/core/#tablite.core.Table.add_rows","title":"tablite.core.Table.add_rows(*args, **kwargs) ","text":"it's more efficient to add many rows at once. if both args and kwargs, then args are added first, followed by kwargs. supported cases: >>> t = Table()\n>>> t.add_columns('row','A','B','C')\n>>> t.add_rows(1, 1, 2, 3) # (1) individual values as args\n>>> t.add_rows([2, 1, 2, 3]) # (2) list of values as args\n>>> t.add_rows((3, 1, 2, 3)) # (3) tuple of values as args\n>>> t.add_rows(*(4, 1, 2, 3)) # (4) unpacked tuple becomes arg like (1)\n>>> t.add_rows(row=5, A=1, B=2, C=3) # (5) kwargs\n>>> t.add_rows(**{'row': 6, 'A': 1, 'B': 2, 'C': 3}) # (6) dict / json interpreted as kwargs\n>>> t.add_rows((7, 1, 2, 3), (8, 4, 5, 6)) # (7) two (or more) tuples as args\n>>> t.add_rows([9, 1, 2, 3], [10, 4, 5, 6]) # (8) two or more lists as args\n>>> t.add_rows(\n {'row': 11, 'A': 1, 'B': 2, 'C': 3},\n {'row': 12, 'A': 4, 'B': 5, 'C': 6}\n ) # (9) two (or more) dicts as args - roughly comma sep'd json.\n>>> t.add_rows( *[\n {'row': 13, 'A': 1, 'B': 2, 'C': 3},\n {'row': 14, 'A': 1, 'B': 2, 'C': 3}\n ]) # (10) list of dicts as args\n>>> t.add_rows(row=[15,16], A=[1,1], B=[2,2], C=[3,3]) # (11) kwargs with lists as values\n Source code in tablite/base.py def add_rows(self, *args, **kwargs):\n \"\"\"it's more efficient to add many rows at once.\n\n if both args and kwargs, then args are added first, followed by kwargs.\n\n supported cases:\n ```\n >>> t = Table()\n >>> t.add_columns('row','A','B','C')\n >>> t.add_rows(1, 1, 2, 3) # (1) individual values as args\n >>> t.add_rows([2, 1, 2, 3]) # (2) list of values as args\n >>> t.add_rows((3, 1, 2, 3)) # (3) tuple of values as args\n >>> t.add_rows(*(4, 1, 2, 3)) # (4) unpacked tuple becomes arg like (1)\n >>> t.add_rows(row=5, A=1, B=2, C=3) # (5) kwargs\n >>> t.add_rows(**{'row': 6, 'A': 1, 'B': 2, 'C': 3}) # (6) dict / json interpreted as kwargs\n >>> t.add_rows((7, 1, 2, 3), (8, 4, 5, 6)) # (7) two (or more) tuples as args\n >>> t.add_rows([9, 1, 2, 3], [10, 4, 5, 6]) # (8) two or more lists as args\n >>> t.add_rows(\n {'row': 11, 'A': 1, 'B': 2, 'C': 3},\n {'row': 12, 'A': 4, 'B': 5, 'C': 6}\n ) # (9) two (or more) dicts as args - roughly comma sep'd json.\n >>> t.add_rows( *[\n {'row': 13, 'A': 1, 'B': 2, 'C': 3},\n {'row': 14, 'A': 1, 'B': 2, 'C': 3}\n ]) # (10) list of dicts as args\n >>> t.add_rows(row=[15,16], A=[1,1], B=[2,2], C=[3,3]) # (11) kwargs with lists as values\n ```\n\n \"\"\"\n if not BaseTable._add_row_slow_warning:\n warnings.warn(\n \"add_rows is slow. Consider using add_columns and then assigning values to the columns directly.\"\n )\n BaseTable._add_row_slow_warning = True\n\n if args:\n if not all(isinstance(i, (list, tuple, dict)) for i in args): # 1,4\n args = [args]\n\n if all(isinstance(i, (list, tuple, dict)) for i in args): # 2,3,7,8\n # 1. 
turn the data into columns:\n\n d = {n: [] for n in self.columns}\n for arg in args:\n if len(arg) != len(self.columns):\n raise ValueError(\n f\"len({arg})== {len(arg)}, but there are {len(self.columns)} columns\"\n )\n\n if isinstance(arg, dict):\n for k, v in arg.items(): # 7,8\n d[k].append(v)\n\n elif isinstance(arg, (list, tuple)): # 2,3\n for n, v in zip(self.columns, arg):\n d[n].append(v)\n\n else:\n raise TypeError(f\"{arg}?\")\n # 2. extend the columns\n for n, values in d.items():\n col = self.columns[n]\n col.extend(list_to_np_array(values))\n\n if kwargs:\n if isinstance(kwargs, dict):\n if all(isinstance(v, (list, tuple)) for v in kwargs.values()):\n for k, v in kwargs.items():\n col = self.columns[k]\n col.extend(list_to_np_array(v))\n else:\n for k, v in kwargs.items():\n col = self.columns[k]\n col.extend(np.array([v]))\n else:\n raise ValueError(f\"format not recognised: {kwargs}\")\n\n return\n "},{"location":"reference/core/#tablite.core.Table.add_columns","title":"tablite.core.Table.add_columns(*names) ","text":"Adds column names to table. Source code in tablite/base.py def add_columns(self, *names):\n \"\"\"Adds column names to table.\"\"\"\n for name in names:\n self.columns[name] = Column(self.path)\n "},{"location":"reference/core/#tablite.core.Table.add_column","title":"tablite.core.Table.add_column(name, data=None) ","text":"verbose alias for table[name] = data, that checks if name already exists PARAMETER DESCRIPTION name column name TYPE: str data values. Defaults to None. TYPE: list,tuple) DEFAULT: None RAISES DESCRIPTION TypeError name isn't string ValueError name already exists Source code in tablite/base.py def add_column(self, name, data=None):\n \"\"\"verbose alias for table[name] = data, that checks if name already exists\n\n Args:\n name (str): column name\n data ((list,tuple), optional): values. Defaults to None.\n\n Raises:\n TypeError: name isn't string\n ValueError: name already exists\n \"\"\"\n if not isinstance(name, str):\n raise TypeError(\"expected name as string\")\n if name in self.columns:\n raise ValueError(f\"{name} already in {self.columns}\")\n self.__setitem__(name, data)\n "},{"location":"reference/core/#tablite.core.Table.stack","title":"tablite.core.Table.stack(other) ","text":"returns the joint stack of tables with overlapping column names. Example: | Table A| + | Table B| = | Table AB |\n| A| B| C| | A| B| D| | A| B| C| -|\n | A| B| -| D|\n Source code in tablite/base.py def stack(self, other):\n \"\"\"\n returns the joint stack of tables with overlapping column names.\n Example:\n ```\n | Table A| + | Table B| = | Table AB |\n | A| B| C| | A| B| D| | A| B| C| -|\n | A| B| -| D|\n ```\n \"\"\"\n if not isinstance(other, BaseTable):\n raise TypeError(f\"stack only works for Table, not {type(other)}\")\n\n cp = self.copy()\n for name, col2 in other.columns.items():\n if name not in cp.columns:\n cp[name] = [None] * len(self)\n cp[name].pages.extend(col2.pages[:])\n\n for name in self.columns:\n if name not in other.columns:\n if len(cp) > 0:\n cp[name].extend(np.array([None] * len(other)))\n return cp\n "},{"location":"reference/core/#tablite.core.Table.types","title":"tablite.core.Table.types() ","text":"returns nested dict of data types in the form: {column name: {python type class: number of instances }, ... 
} example: >>> t.types()\n{\n 'A': {<class 'str'>: 7},\n 'B': {<class 'int'>: 7}\n}\n Source code in tablite/base.py def types(self):\n \"\"\"\n returns nested dict of data types in the form:\n `{column name: {python type class: number of instances }, ... }`\n\n example:\n ```\n >>> t.types()\n {\n 'A': {<class 'str'>: 7},\n 'B': {<class 'int'>: 7}\n }\n ```\n \"\"\"\n d = {}\n for name, col in self.columns.items():\n assert isinstance(col, Column)\n d[name] = col.types()\n return d\n "},{"location":"reference/core/#tablite.core.Table.display_dict","title":"tablite.core.Table.display_dict(slice_=None, blanks=None, dtype=False) ","text":"helper for creating dict for display. PARAMETER DESCRIPTION slice_ python slice. Defaults to None. TYPE: slice DEFAULT: None blanks fill value for None . Defaults to None. TYPE: optional DEFAULT: None dtype Adds datatype to each column. Defaults to False. TYPE: bool DEFAULT: False RAISES DESCRIPTION TypeError slice_ must be None or slice. RETURNS DESCRIPTION dict from Table. Source code in tablite/base.py def display_dict(self, slice_=None, blanks=None, dtype=False):\n \"\"\"helper for creating dict for display.\n\n Args:\n slice_ (slice, optional): python slice. Defaults to None.\n blanks (optional): fill value for `None`. Defaults to None.\n dtype (bool, optional): Adds datatype to each column. Defaults to False.\n\n Raises:\n TypeError: slice_ must be None or slice.\n\n Returns:\n dict: from Table.\n \"\"\"\n if not self.columns:\n print(\"Empty Table\")\n return\n\n def datatype(col): # PRIVATE\n \"\"\"creates label for column datatype.\"\"\"\n types = col.types()\n if len(types) == 0:\n typ = \"empty\"\n elif len(types) == 1:\n dt, _ = types.popitem()\n typ = dt.__name__\n else:\n typ = \"mixed\"\n return typ\n\n row_count_tags = [\"#\", \"~\", \"*\"]\n cols = set(self.columns)\n for n, tag in product(range(1, 6), row_count_tags):\n if n * tag not in cols:\n tag = n * tag\n break\n\n if not isinstance(slice_, (slice, type(None))):\n raise TypeError(f\"slice_ must be None or slice, not {type(slice_)}\")\n if isinstance(slice_, slice):\n slc = slice_\n if slice_ is None:\n if len(self) <= 20:\n slc = slice(0, 20, 1)\n else:\n slc = None\n\n n = len(self)\n if slc: # either we want slc or we want everything.\n row_no = list(range(*slc.indices(len(self))))\n data = {tag: [f\"{i:,}\".rjust(2) for i in row_no]}\n for name, col in self.columns.items():\n data[name] = list(chain(iter(col), repeat(blanks, times=n - len(col))))[\n slc\n ]\n else:\n data = {}\n j = int(math.ceil(math.log10(n)) / 3) + len(str(n))\n row_no = (\n [f\"{i:,}\".rjust(j) for i in range(7)]\n + [\"...\"]\n + [f\"{i:,}\".rjust(j) for i in range(n - 7, n)]\n )\n data = {tag: row_no}\n\n for name, col in self.columns.items():\n if len(col) == n:\n row = col[:7].tolist() + [\"...\"] + col[-7:].tolist()\n else:\n empty = [blanks] * 7\n head = (col[:7].tolist() + empty)[:7]\n tail = (col[n - 7 :].tolist() + empty)[-7:]\n row = head + [\"...\"] + tail\n data[name] = row\n\n if dtype:\n for name, values in data.items():\n if name in self.columns:\n col = self.columns[name]\n values.insert(0, datatype(col))\n else:\n values.insert(0, \"row\")\n\n return data\n "},{"location":"reference/core/#tablite.core.Table.to_ascii","title":"tablite.core.Table.to_ascii(slice_=None, blanks=None, dtype=False) ","text":"returns ascii view of table as string. PARAMETER DESCRIPTION slice_ slice to determine table snippet. TYPE: slice DEFAULT: None blanks value for whitespace. Defaults to None. 
TYPE: str DEFAULT: None dtype adds subheader with datatype for column. Defaults to False. TYPE: bool DEFAULT: False Source code in tablite/base.py def to_ascii(self, slice_=None, blanks=None, dtype=False):\n \"\"\"returns ascii view of table as string.\n\n Args:\n slice_ (slice, optional): slice to determine table snippet.\n blanks (str, optional): value for whitespace. Defaults to None.\n dtype (bool, optional): adds subheader with datatype for column. Defaults to False.\n \"\"\"\n\n def adjust(v, length): # PRIVATE FUNCTION\n \"\"\"whitespace justifies field values based on datatype\"\"\"\n if v is None:\n return str(blanks).ljust(length)\n elif isinstance(v, str):\n return v.ljust(length)\n else:\n return str(v).rjust(length)\n\n if not self.columns:\n return str(self)\n\n d = {}\n for name, values in self.display_dict(\n slice_=slice_, blanks=blanks, dtype=dtype\n ).items():\n as_text = [str(v) for v in values] + [str(name)]\n width = max(len(i) for i in as_text)\n new_name = name.center(width, \" \")\n if dtype:\n values[0] = values[0].center(width, \" \")\n d[new_name] = [adjust(v, width) for v in values]\n\n rows = dict_to_rows(d)\n s = []\n s.append(\"+\" + \"+\".join([\"=\" * len(n) for n in rows[0]]) + \"+\")\n s.append(\"|\" + \"|\".join(rows[0]) + \"|\") # column names\n start = 1\n if dtype:\n s.append(\"|\" + \"|\".join(rows[1]) + \"|\") # datatypes\n start = 2\n\n s.append(\"+\" + \"+\".join([\"-\" * len(n) for n in rows[0]]) + \"+\")\n for row in rows[start:]:\n s.append(\"|\" + \"|\".join(row) + \"|\")\n s.append(\"+\" + \"+\".join([\"=\" * len(n) for n in rows[0]]) + \"+\")\n\n if len(set(len(c) for c in self.columns.values())) != 1:\n warning = f\"Warning: Columns have different lengths. {blanks} is used as fill value.\"\n s.append(warning)\n\n return \"\\n\".join(s)\n "},{"location":"reference/core/#tablite.core.Table.show","title":"tablite.core.Table.show(slice_=None, blanks=None, dtype=False) ","text":"prints ascii view of table. PARAMETER DESCRIPTION slice_ slice to determine table snippet. TYPE: slice DEFAULT: None blanks value for whitespace. Defaults to None. TYPE: str DEFAULT: None dtype adds subheader with datatype for column. Defaults to False. TYPE: bool DEFAULT: False Source code in tablite/base.py def show(self, slice_=None, blanks=None, dtype=False):\n \"\"\"prints ascii view of table.\n\n Args:\n slice_ (slice, optional): slice to determine table snippet.\n blanks (str, optional): value for whitespace. Defaults to None.\n dtype (bool, optional): adds subheader with datatype for column. Defaults to False.\n \"\"\"\n print(self.to_ascii(slice_=slice_, blanks=blanks, dtype=dtype))\n "},{"location":"reference/core/#tablite.core.Table.to_dict","title":"tablite.core.Table.to_dict(columns=None, slice_=None) ","text":"columns: list of column names. Default is None == all columns. slice_: slice. Default is None == all rows. returns: dict with columns as keys and lists of values. Example: >>> t.show()\n+===+===+===+\n| # | a | b |\n|row|int|int|\n+---+---+---+\n| 0 | 1| 3|\n| 1 | 2| 4|\n+===+===+===+\n>>> t.to_dict()\n{'a':[1,2], 'b':[3,4]}\n Source code in tablite/base.py def to_dict(self, columns=None, slice_=None):\n \"\"\"\n columns: list of column names. Default is None == all columns.\n slice_: slice. 
Default is None == all rows.\n\n returns: dict with columns as keys and lists of values.\n\n Example:\n ```\n >>> t.show()\n +===+===+===+\n | # | a | b |\n |row|int|int|\n +---+---+---+\n | 0 | 1| 3|\n | 1 | 2| 4|\n +===+===+===+\n >>> t.to_dict()\n {'a':[1,2], 'b':[3,4]}\n ```\n\n \"\"\"\n if slice_ is None:\n slice_ = slice(0, len(self))\n assert isinstance(slice_, slice)\n\n if columns is None:\n columns = list(self.columns.keys())\n if not isinstance(columns, list):\n raise TypeError(\"expected columns as list of strings\")\n\n return {name: list(self.columns[name][slice_]) for name in columns}\n "},{"location":"reference/core/#tablite.core.Table.as_json_serializable","title":"tablite.core.Table.as_json_serializable(row_count='row id', start_on=1, columns=None, slice_=None) ","text":"provides a JSON compatible format of the table. PARAMETER DESCRIPTION row_count Label for row counts. Defaults to \"row id\". TYPE: str DEFAULT: 'row id' start_on row counts starts by default on 1. TYPE: int DEFAULT: 1 columns Column names. Defaults to None which returns all columns. TYPE: list of str DEFAULT: None slice_ selector. Defaults to None which returns [:] TYPE: slice DEFAULT: None RETURNS DESCRIPTION JSON serializable dict: All python datatypes have been converted to JSON compliant data. Source code in tablite/base.py def as_json_serializable(\n self, row_count=\"row id\", start_on=1, columns=None, slice_=None\n):\n \"\"\"provides a JSON compatible format of the table.\n\n Args:\n row_count (str, optional): Label for row counts. Defaults to \"row id\".\n start_on (int, optional): row counts starts by default on 1.\n columns (list of str, optional): Column names.\n Defaults to None which returns all columns.\n slice_ (slice, optional): selector. Defaults to None which returns [:]\n\n Returns:\n JSON serializable dict: All python datatypes have been converted to JSON compliant data.\n \"\"\"\n if slice_ is None:\n slice_ = slice(0, len(self))\n\n assert isinstance(slice_, slice)\n new = {\"columns\": {}, \"total_rows\": len(self)}\n if row_count is not None:\n new[\"columns\"][row_count] = [\n i + start_on for i in range(*slice_.indices(len(self)))\n ]\n\n d = self.to_dict(columns, slice_=slice_)\n for k, data in d.items():\n new_k = unique_name(\n k, new[\"columns\"]\n ) # used to avoid overwriting the `row id` key.\n new[\"columns\"][new_k] = [\n DataTypes.to_json(v) for v in data\n ] # deal with non-json datatypes.\n return new\n "},{"location":"reference/core/#tablite.core.Table.index","title":"tablite.core.Table.index(*args) ","text":"param: *args: column names returns multikey index on the columns as d[(key tuple, )] = {index1, index2, ...} Examples: >>> table6 = Table()\n>>> table6['A'] = ['Alice', 'Bob', 'Bob', 'Ben', 'Charlie', 'Ben','Albert']\n>>> table6['B'] = ['Alison', 'Marley', 'Dylan', 'Affleck', 'Hepburn', 'Barnes', 'Einstein']\n >>> table6.index('A') # single key.\n{('Alice',): [0],\n ('Bob',): [1, 2],\n ('Ben',): [3, 5],\n ('Charlie',): [4],\n ('Albert',): [6]})\n >>> table6.index('A', 'B') # multiple keys.\n{('Alice', 'Alison'): [0],\n ('Bob', 'Marley'): [1],\n ('Bob', 'Dylan'): [2],\n ('Ben', 'Affleck'): [3],\n ('Charlie', 'Hepburn'): [4],\n ('Ben', 'Barnes'): [5],\n ('Albert', 'Einstein'): [6]})\n Source code in tablite/base.py def index(self, *args):\n \"\"\"\n param: *args: column names\n returns multikey index on the columns as d[(key tuple, )] = {index1, index2, ...}\n\n Examples:\n ```\n >>> table6 = Table()\n >>> table6['A'] = ['Alice', 'Bob', 'Bob', 'Ben', 'Charlie', 
'Ben','Albert']\n >>> table6['B'] = ['Alison', 'Marley', 'Dylan', 'Affleck', 'Hepburn', 'Barnes', 'Einstein']\n ```\n\n ```\n >>> table6.index('A') # single key.\n {('Alice',): [0],\n ('Bob',): [1, 2],\n ('Ben',): [3, 5],\n ('Charlie',): [4],\n ('Albert',): [6]})\n ```\n\n ```\n >>> table6.index('A', 'B') # multiple keys.\n {('Alice', 'Alison'): [0],\n ('Bob', 'Marley'): [1],\n ('Bob', 'Dylan'): [2],\n ('Ben', 'Affleck'): [3],\n ('Charlie', 'Hepburn'): [4],\n ('Ben', 'Barnes'): [5],\n ('Albert', 'Einstein'): [6]})\n ```\n\n \"\"\"\n idx = defaultdict(list)\n iterators = [iter(self.columns[c]) for c in args]\n for ix, key in enumerate(zip(*iterators)):\n key = tuple(numpy_to_python(k) for k in key)\n idx[key].append(ix)\n return idx\n "},{"location":"reference/core/#tablite.core.Table.unique_index","title":"tablite.core.Table.unique_index(*args, tqdm=_tqdm) ","text":"generates the index of unique rows given a list of column names PARAMETER DESCRIPTION *args columns names TYPE: any DEFAULT: () tqdm Defaults to _tqdm. TYPE: tqdm DEFAULT: tqdm RETURNS DESCRIPTION np.array(int64): indices of unique records. Source code in tablite/base.py def unique_index(self, *args, tqdm=_tqdm):\n \"\"\"generates the index of unique rows given a list of column names\n\n Args:\n *args (any): columns names\n tqdm (tqdm, optional): Defaults to _tqdm.\n\n Returns:\n np.array(int64): indices of unique records.\n \"\"\"\n if not args:\n raise ValueError(\"*args (column names) is required\")\n seen = set()\n unique = set()\n iterators = [iter(self.columns[c]) for c in args]\n for ix, key in tqdm(enumerate(zip(*iterators)), disable=Config.TQDM_DISABLE):\n key_hash = hash(tuple(numpy_to_python(k) for k in key))\n if key_hash in seen:\n continue\n else:\n seen.add(key_hash)\n unique.add(ix)\n return np.array(sorted(unique))\n "},{"location":"reference/core/#tablite.core.Table.from_file","title":"tablite.core.Table.from_file(path, columns=None, first_row_has_headers=True, header_row_index=0, encoding=None, start=0, limit=sys.maxsize, sheet=None, guess_datatypes=True, newline='\\n', text_qualifier=None, delimiter=None, strip_leading_and_tailing_whitespace=True, text_escape_openings='', text_escape_closures='', skip_empty: ValidSkipEmpty = 'NONE', tqdm=_tqdm) -> Table classmethod ","text":" reads path and imports 1 or more tables\n\n REQUIRED\n --------\n path: pathlib.Path or str\n selection of filereader uses path.suffix.\n See `filereaders`.\n\n OPTIONAL\n --------\n columns:\n None: (default) All columns will be imported.\n List: only column names from list will be imported (if present in file)\n e.g. ['A', 'B', 'C', 'D']\n\n datatype is detected using Datatypes.guess(...)\n You can try it out with:\n >> from tablite.datatypes import DataTypes\n >> DataTypes.guess(['001','100'])\n [1,100]\n\n if the format cannot be achieved the read type is kept.\n Excess column names are ignored.\n\n HINT: To get the head of file use:\n >>> from tablite.tools import head\n >>> head = head(path)\n\n first_row_has_headers: boolean\n True: (default) first row is used as column names.\n False: integers are used as column names.\n\n encoding: str. Defaults to None (autodetect using n bytes).\n n is declared in filereader_utils as ENCODING_GUESS_BYTES\n\n start: the first line to be read (default: 0)\n\n limit: the number of lines to be read from start (default sys.maxint ~ 2**63)\n\n OPTIONAL FOR EXCEL AND ODS READERS\n ----------------------------------\n\n sheet: sheet name to import (applicable to excel- and ods-reader only)\n e.g. 
'sheet_1'\n sheet names that are not found are ignored.\n\n OPTIONAL FOR TEXT READERS\n -------------------------\n guess_datatype: bool\n True: (default) datatypes are guessed using DataTypes.guess(...)\n False: all data is imported as strings.\n\n newline: newline character (applicable to text_reader only)\n str: '\\n' (default) or '\\r\\n'\n\n text_qualifier: character (applicable to text_reader only)\n None: No text qualifier is used.\n str: \" or '\n\n delimiter: character (applicable to text_reader only)\n None: file suffix is used to determine field delimiter:\n .txt: \"|\"\n .csv: \",\",\n .ssv: \";\"\n .tsv: \"\\t\" (tab)\n\n strip_leading_and_tailing_whitespace: bool:\n True: default\n\n text_escape_openings: (applicable to text_reader only)\n None: default\n str: list of characters such as ([{\n\n text_escape_closures: (applicable to text_reader only)\n None: default\n str: list of characters such as }])\n Source code in tablite/core.py @classmethod\ndef from_file(\n cls,\n path,\n columns=None,\n first_row_has_headers=True,\n header_row_index=0,\n encoding=None,\n start=0,\n limit=sys.maxsize,\n sheet=None,\n guess_datatypes=True,\n newline=\"\\n\",\n text_qualifier=None,\n delimiter=None,\n strip_leading_and_tailing_whitespace=True,\n text_escape_openings=\"\",\n text_escape_closures=\"\",\n skip_empty: ValidSkipEmpty=\"NONE\",\n tqdm=_tqdm,\n) -> \"Table\":\n \"\"\"\n reads path and imports 1 or more tables\n\n REQUIRED\n --------\n path: pathlib.Path or str\n selection of filereader uses path.suffix.\n See `filereaders`.\n\n OPTIONAL\n --------\n columns:\n None: (default) All columns will be imported.\n List: only column names from list will be imported (if present in file)\n e.g. ['A', 'B', 'C', 'D']\n\n datatype is detected using Datatypes.guess(...)\n You can try it out with:\n >> from tablite.datatypes import DataTypes\n >> DataTypes.guess(['001','100'])\n [1,100]\n\n if the format cannot be achieved the read type is kept.\n Excess column names are ignored.\n\n HINT: To get the head of file use:\n >>> from tablite.tools import head\n >>> head = head(path)\n\n first_row_has_headers: boolean\n True: (default) first row is used as column names.\n False: integers are used as column names.\n\n encoding: str. Defaults to None (autodetect using n bytes).\n n is declared in filereader_utils as ENCODING_GUESS_BYTES\n\n start: the first line to be read (default: 0)\n\n limit: the number of lines to be read from start (default sys.maxint ~ 2**63)\n\n OPTIONAL FOR EXCEL AND ODS READERS\n ----------------------------------\n\n sheet: sheet name to import (applicable to excel- and ods-reader only)\n e.g. 
'sheet_1'\n sheets not found excess names are ignored.\n\n OPTIONAL FOR TEXT READERS\n -------------------------\n guess_datatype: bool\n True: (default) datatypes are guessed using DataTypes.guess(...)\n False: all data is imported as strings.\n\n newline: newline character (applicable to text_reader only)\n str: '\\n' (default) or '\\r\\n'\n\n text_qualifier: character (applicable to text_reader only)\n None: No text qualifier is used.\n str: \" or '\n\n delimiter: character (applicable to text_reader only)\n None: file suffix is used to determine field delimiter:\n .txt: \"|\"\n .csv: \",\",\n .ssv: \";\"\n .tsv: \"\\t\" (tab)\n\n strip_leading_and_tailing_whitespace: bool:\n True: default\n\n text_escape_openings: (applicable to text_reader only)\n None: default\n str: list of characters such as ([{\n\n text_escape_closures: (applicable to text_reader only)\n None: default\n str: list of characters such as }])\n\n \"\"\"\n if isinstance(path, str):\n path = Path(path)\n type_check(path, Path)\n\n if not path.exists():\n raise FileNotFoundError(f\"file not found: {path}\")\n\n if not isinstance(start, int) or not 0 <= start <= sys.maxsize:\n raise ValueError(f\"start {start} not in range(0,{sys.maxsize})\")\n\n if not isinstance(limit, int) or not 0 < limit <= sys.maxsize:\n raise ValueError(f\"limit {limit} not in range(0,{sys.maxsize})\")\n\n if not isinstance(first_row_has_headers, bool):\n raise TypeError(\"first_row_has_headers is not bool\")\n\n import_as = path.suffix\n if import_as.startswith(\".\"):\n import_as = import_as[1:]\n\n reader = import_utils.file_readers.get(import_as, None)\n if reader is None:\n raise ValueError(f\"{import_as} is not in supported format: {import_utils.valid_readers}\")\n\n additional_configs = {\"tqdm\": tqdm}\n if reader == import_utils.text_reader:\n # here we inject tqdm, if tqdm is not provided, use generic iterator\n # fmt:off\n config = (path, columns, first_row_has_headers, header_row_index, encoding, start, limit, newline,\n guess_datatypes, text_qualifier, strip_leading_and_tailing_whitespace, skip_empty,\n delimiter, text_escape_openings, text_escape_closures)\n # fmt:on\n\n elif reader == import_utils.from_html:\n config = (path,)\n elif reader == import_utils.from_hdf5:\n config = (path,)\n\n elif reader == import_utils.excel_reader:\n # config = path, first_row_has_headers, sheet, columns, start, limit\n config = (\n path,\n first_row_has_headers,\n header_row_index,\n sheet,\n columns,\n skip_empty,\n start,\n limit,\n ) # if file length changes - re-import.\n\n if reader == import_utils.ods_reader:\n # path, first_row_has_headers=True, sheet=None, columns=None, start=0, limit=sys.maxsize,\n config = (\n str(path),\n first_row_has_headers,\n header_row_index,\n sheet,\n columns,\n skip_empty,\n start,\n limit,\n ) # if file length changes - re-import.\n\n # At this point the import config seems valid.\n # Now we check if the file already has been imported.\n\n # publish the settings\n return reader(cls, *config, **additional_configs)\n "},{"location":"reference/core/#tablite.core.Table.from_pandas","title":"tablite.core.Table.from_pandas(df) classmethod ","text":"Creates Table using pd.to_dict('list') similar to: >>> import pandas as pd\n>>> df = pd.DataFrame({'a':[1,2,3], 'b':[4,5,6]})\n>>> df\n a b\n 0 1 4\n 1 2 5\n 2 3 6\n>>> df.to_dict('list')\n{'a': [1, 2, 3], 'b': [4, 5, 6]}\n>>> t = Table.from_dict(df.to_dict('list))\n>>> t.show()\n +===+===+===+\n | # | a | b |\n |row|int|int|\n +---+---+---+\n | 0 | 1| 4|\n | 1 | 2| 5|\n | 2 
| 3| 6|\n +===+===+===+\n Source code in tablite/core.py @classmethod\ndef from_pandas(cls, df):\n \"\"\"\n Creates Table using pd.to_dict('list')\n\n similar to:\n ```\n >>> import pandas as pd\n >>> df = pd.DataFrame({'a':[1,2,3], 'b':[4,5,6]})\n >>> df\n a b\n 0 1 4\n 1 2 5\n 2 3 6\n >>> df.to_dict('list')\n {'a': [1, 2, 3], 'b': [4, 5, 6]}\n >>> t = Table.from_dict(df.to_dict('list'))\n >>> t.show()\n +===+===+===+\n | # | a | b |\n |row|int|int|\n +---+---+---+\n | 0 | 1| 4|\n | 1 | 2| 5|\n | 2 | 3| 6|\n +===+===+===+\n ```\n \"\"\"\n return import_utils.from_pandas(cls, df)\n "},{"location":"reference/core/#tablite.core.Table.from_hdf5","title":"tablite.core.Table.from_hdf5(path) classmethod ","text":"imports an exported hdf5 table. Source code in tablite/core.py @classmethod\ndef from_hdf5(cls, path):\n \"\"\"\n imports an exported hdf5 table.\n \"\"\"\n return import_utils.from_hdf5(cls, path)\n "},{"location":"reference/core/#tablite.core.Table.from_json","title":"tablite.core.Table.from_json(jsn) classmethod ","text":"Imports table exported using .to_json Source code in tablite/core.py @classmethod\ndef from_json(cls, jsn):\n \"\"\"\n Imports table exported using .to_json\n \"\"\"\n return import_utils.from_json(cls, jsn)\n "},{"location":"reference/core/#tablite.core.Table.to_hdf5","title":"tablite.core.Table.to_hdf5(path) ","text":"creates a copy of the table as hdf5 Source code in tablite/core.py def to_hdf5(self, path):\n \"\"\"\n creates a copy of the table as hdf5\n \"\"\"\n export_utils.to_hdf5(self, path)\n "},{"location":"reference/core/#tablite.core.Table.to_pandas","title":"tablite.core.Table.to_pandas() ","text":"returns pandas.DataFrame Source code in tablite/core.py def to_pandas(self):\n \"\"\"\n returns pandas.DataFrame\n \"\"\"\n return export_utils.to_pandas(self)\n "},{"location":"reference/core/#tablite.core.Table.to_sql","title":"tablite.core.Table.to_sql(name) ","text":"generates ANSI-92 compliant SQL. 
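A minimal usage sketch (added here, not part of the quoted docstrings): per the signature and source above, to_sql returns the generated SQL as a string, so it can be written to a file or handed to a database driver. The table contents and file name are illustrative.

```python
from tablite import Table

t = Table()
t['a'] = [1, 2, 3]
t['b'] = [4, 5, 6]

sql = t.to_sql('my_table')         # 'my_table' is a hypothetical target table name
with open('my_table.sql', 'w') as fo:
    fo.write(sql)                  # ANSI-92 compliant SQL, per the docstring
```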
Source code in tablite/core.py def to_sql(self, name):\n \"\"\"\n generates ANSI-92 compliant SQL.\n \"\"\"\n return export_utils.to_sql(self, name) # remove after update to test suite.\n "},{"location":"reference/core/#tablite.core.Table.to_json","title":"tablite.core.Table.to_json() ","text":"returns JSON Source code in tablite/core.py def to_json(self):\n \"\"\"\n returns JSON\n \"\"\"\n return export_utils.to_json(self)\n "},{"location":"reference/core/#tablite.core.Table.to_xlsx","title":"tablite.core.Table.to_xlsx(path) ","text":"exports table to path Source code in tablite/core.py def to_xlsx(self, path):\n \"\"\"\n exports table to path\n \"\"\"\n export_utils.path_suffix_check(path, \".xlsx\")\n export_utils.excel_writer(self, path)\n "},{"location":"reference/core/#tablite.core.Table.to_ods","title":"tablite.core.Table.to_ods(path) ","text":"exports table to path Source code in tablite/core.py def to_ods(self, path):\n \"\"\"\n exports table to path\n \"\"\"\n export_utils.path_suffix_check(path, \".ods\")\n export_utils.excel_writer(self, path)\n "},{"location":"reference/core/#tablite.core.Table.to_csv","title":"tablite.core.Table.to_csv(path) ","text":"exports table to path Source code in tablite/core.py def to_csv(self, path):\n \"\"\"\n exports table to path\n \"\"\"\n export_utils.path_suffix_check(path, \".csv\")\n export_utils.text_writer(self, path)\n "},{"location":"reference/core/#tablite.core.Table.to_tsv","title":"tablite.core.Table.to_tsv(path) ","text":"exports table to path Source code in tablite/core.py def to_tsv(self, path):\n \"\"\"\n exports table to path\n \"\"\"\n export_utils.path_suffix_check(path, \".tsv\")\n export_utils.text_writer(self, path)\n "},{"location":"reference/core/#tablite.core.Table.to_text","title":"tablite.core.Table.to_text(path) ","text":"exports table to path Source code in tablite/core.py def to_text(self, path):\n \"\"\"\n exports table to path\n \"\"\"\n export_utils.path_suffix_check(path, \".txt\")\n export_utils.text_writer(self, path)\n "},{"location":"reference/core/#tablite.core.Table.to_html","title":"tablite.core.Table.to_html(path) ","text":"exports table to path Source code in tablite/core.py def to_html(self, path):\n \"\"\"\n exports table to path\n \"\"\"\n export_utils.path_suffix_check(path, \".html\")\n export_utils.to_html(self, path)\n "},{"location":"reference/core/#tablite.core.Table.expression","title":"tablite.core.Table.expression(expression) ","text":"filters based on an expression, such as: \"all((A==B, C!=4, 200<D))\"\n which is interpreted using python's compiler to: def _f(A,B,C,D):\n return all((A==B, C!=4, 200<D))\n Source code in tablite/core.py def expression(self, expression):\n \"\"\"\n filters based on an expression, such as:\n\n \"all((A==B, C!=4, 200<D))\"\n\n which is interpreted using python's compiler to:\n\n def _f(A,B,C,D):\n return all((A==B, C!=4, 200<D))\n \"\"\"\n return redux._filter_using_expression(self, expression)\n "},{"location":"reference/core/#tablite.core.Table.filter","title":"tablite.core.Table.filter(expressions, filter_type='all', tqdm=_tqdm) ","text":"enables filtering across columns for multiple criteria. 
expressions: str: Expression that can be compiled and executed row by row.\n example: \"all((A==B, C!=4, 200<D))\"\n\nlist of dicts: (example):\n\n L = [\n {'column1':'A', 'criteria': \"==\", 'column2': 'B'},\n {'column1':'C', 'criteria': \"!=\", \"value2\": '4'},\n {'value1': 200, 'criteria': \"<\", 'column2': 'D' }\n ]\n\naccepted dictionary keys: 'column1', 'column2', 'criteria', 'value1', 'value2'\n filter_type: 'all' or 'any' Source code in tablite/core.py def filter(self, expressions, filter_type=\"all\", tqdm=_tqdm):\n \"\"\"\n enables filtering across columns for multiple criteria.\n\n expressions:\n\n str: Expression that can be compiled and executed row by row.\n example: \"all((A==B, C!=4, 200<D))\"\n\n list of dicts: (example):\n\n L = [\n {'column1':'A', 'criteria': \"==\", 'column2': 'B'},\n {'column1':'C', 'criteria': \"!=\", \"value2\": '4'},\n {'value1': 200, 'criteria': \"<\", 'column2': 'D' }\n ]\n\n accepted dictionary keys: 'column1', 'column2', 'criteria', 'value1', 'value2'\n\n filter_type: 'all' or 'any'\n \"\"\"\n return redux.filter(self, expressions, filter_type, tqdm)\n "},{"location":"reference/core/#tablite.core.Table.sort_index","title":"tablite.core.Table.sort_index(sort_mode='excel', tqdm=_tqdm, pbar=None, **kwargs) ","text":"helper for methods sort and is_sorted param: sort_mode: str: \"alphanumeric\", \"unix\", or, \"excel\" (default) param: **kwargs: sort criteria. See Table.sort() Source code in tablite/core.py def sort_index(self, sort_mode=\"excel\", tqdm=_tqdm, pbar=None, **kwargs):\n \"\"\"\n helper for methods `sort` and `is_sorted`\n\n param: sort_mode: str: \"alphanumeric\", \"unix\", or, \"excel\" (default)\n param: **kwargs: sort criteria. See Table.sort()\n \"\"\"\n return sortation.sort_index(self, sort_mode, tqdm=tqdm, pbar=pbar, **kwargs)\n "},{"location":"reference/core/#tablite.core.Table.reindex","title":"tablite.core.Table.reindex(index) ","text":"index: list of integers that declare sort order. Examples: Table: ['a','b','c','d','e','f','g','h']\nindex: [0,2,4,6]\nresult: ['a','c','e','g']\n\nTable: ['a','b','c','d','e','f','g','h']\nindex: [0,2,4,6,1,3,5,7]\nresult: ['a','c','e','g','b','d','f','h']\n Source code in tablite/core.py def reindex(self, index):\n \"\"\"\n index: list of integers that declare sort order.\n\n Examples:\n\n Table: ['a','b','c','d','e','f','g','h']\n index: [0,2,4,6]\n result: ['a','c','e','g']\n\n Table: ['a','b','c','d','e','f','g','h']\n index: [0,2,4,6,1,3,5,7]\n result: ['a','c','e','g','b','d','f','h']\n\n \"\"\"\n if isinstance(index, list):\n index = np.array(index)\n return _reindex.reindex(self, index)\n "},{"location":"reference/core/#tablite.core.Table.drop_duplicates","title":"tablite.core.Table.drop_duplicates(*args) ","text":"removes duplicate rows based on column names args: (optional) column_names if no args, all columns are used. Source code in tablite/core.py def drop_duplicates(self, *args):\n \"\"\"\n removes duplicate rows based on column names\n\n args: (optional) column_names\n if no args, all columns are used.\n \"\"\"\n if not args:\n args = self.columns\n index = self.unique_index(*args)\n return self.reindex(index)\n "},{"location":"reference/core/#tablite.core.Table.sort","title":"tablite.core.Table.sort(mapping, sort_mode='excel', tqdm=_tqdm, pbar: _tqdm = None) ","text":"Perform multi-pass sorting with precedence given by the order of column names. 
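A hedged usage sketch of filter with the list-of-dicts form documented above; the data and column names are illustrative, and the shape of the returned result follows the compress_both call in the source shown earlier (it splits rows by the computed mask).

```python
from tablite import Table

t = Table()
t['A'] = [1, 2, 3, 4]
t['B'] = [1, 2, 30, 40]

# keep rows where A == B AND A < 3
result = t.filter(
    [
        {'column1': 'A', 'criteria': '==', 'column2': 'B'},
        {'column1': 'A', 'criteria': '<', 'value2': 3},
    ],
    filter_type='all',
)
# per the source above, rows are split via compress_both into the
# rows that satisfy the criteria and the rows that do not.
```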
PARAMETER DESCRIPTION mapping keys as columns, values as boolean for 'reverse' TYPE: dict sort_mode str: \"alphanumeric\", \"unix\", or, \"excel\" DEFAULT: 'excel' RETURNS DESCRIPTION None the table is sorted in-place Examples: Table.sort(mapping={'A':False}) means sort by 'A' in ascending order. Table.sort(mapping={'A':True, 'B':False}) means sort 'A' in descending order, then (2nd priority) sort B in ascending order. Source code in tablite/core.py def sort(self, mapping, sort_mode=\"excel\", tqdm=_tqdm, pbar: _tqdm = None):\n \"\"\"Perform multi-pass sorting with precedence given by the order of column names.\n\n Args:\n mapping (dict): keys as columns,\n values as boolean for 'reverse'\n sort_mode: str: \"alphanumeric\", \"unix\", or, \"excel\"\n\n Returns:\n None: the table is sorted in-place\n\n Examples:\n Table.sort(mapping={'A':False}) means sort by 'A' in ascending order.\n Table.sort(mapping={'A':True, 'B':False}) means sort 'A' in descending order, then (2nd priority)\n sort B in ascending order.\n \"\"\"\n new = sortation.sort(self, mapping, sort_mode, tqdm=tqdm, pbar=pbar)\n self.columns = new.columns\n "},{"location":"reference/core/#tablite.core.Table.sorted","title":"tablite.core.Table.sorted(mapping, sort_mode='excel', tqdm=_tqdm, pbar: _tqdm = None) ","text":"See sort. Sorted returns a new table in contrast to \"sort\", which is in-place. RETURNS DESCRIPTION Table. Source code in tablite/core.py def sorted(self, mapping, sort_mode=\"excel\", tqdm=_tqdm, pbar: _tqdm = None):\n \"\"\"See sort.\n Sorted returns a new table in contrast to \"sort\", which is in-place.\n\n Returns:\n Table.\n \"\"\"\n return sortation.sort(self, mapping, sort_mode, tqdm=tqdm, pbar=pbar)\n "},{"location":"reference/core/#tablite.core.Table.is_sorted","title":"tablite.core.Table.is_sorted(mapping, sort_mode='excel') ","text":"Performs multi-pass sorting check with precedence given by the order of column names. **kwargs: optional: sort criteria. 
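A short sketch contrasting sort (in-place) with sorted (returns a new table), using the mapping convention from the docstrings above; the data is illustrative.

```python
from tablite import Table

t = Table()
t['A'] = [3, 1, 2]
t['B'] = ['x', 'y', 'z']

t.sort(mapping={'A': False})          # ascending by 'A'; sorts t in-place
new = t.sorted(mapping={'A': True})   # descending by 'A'; t is unchanged
assert t.is_sorted(mapping={'A': False})
```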
See Table.sort()\n :return bool\n \"\"\"\n return sortation.is_sorted(self, mapping, sort_mode)\n "},{"location":"reference/core/#tablite.core.Table.any","title":"tablite.core.Table.any(**kwargs) ","text":"returns Table for rows where ANY kwargs match :param kwargs: dictionary with headers and values / boolean callable Source code in tablite/core.py def any(self, **kwargs):\n \"\"\"\n returns Table for rows where ANY kwargs match\n :param kwargs: dictionary with headers and values / boolean callable\n \"\"\"\n return redux.filter_any(self, **kwargs)\n "},{"location":"reference/core/#tablite.core.Table.all","title":"tablite.core.Table.all(**kwargs) ","text":"returns Table for rows where ALL kwargs match :param kwargs: dictionary with headers and values / boolean callable Examples: t = Table()\nt['a'] = [1,2,3,4]\nt['b'] = [10,20,30,40]\n\ndef f(x):\n return x == 4\ndef g(x):\n return x < 20\n\nt2 = t.any( **{\"a\":f, \"b\":g})\nassert [r for r in t2.rows] == [[1, 10], [4, 40]]\n\nt2 = t.any(a=f,b=g)\nassert [r for r in t2.rows] == [[1, 10], [4, 40]]\n\ndef h(x):\n return x>=2\n\ndef i(x):\n return x<=30\n\nt2 = t.all(a=h,b=i)\nassert [r for r in t2.rows] == [[2,20], [3, 30]]\n Source code in tablite/core.py def all(self, **kwargs):\n \"\"\"\n returns Table for rows where ALL kwargs match\n :param kwargs: dictionary with headers and values / boolean callable\n\n Examples:\n\n t = Table()\n t['a'] = [1,2,3,4]\n t['b'] = [10,20,30,40]\n\n def f(x):\n return x == 4\n def g(x):\n return x < 20\n\n t2 = t.any( **{\"a\":f, \"b\":g})\n assert [r for r in t2.rows] == [[1, 10], [4, 40]]\n\n t2 = t.any(a=f,b=g)\n assert [r for r in t2.rows] == [[1, 10], [4, 40]]\n\n def h(x):\n return x>=2\n\n def i(x):\n return x<=30\n\n t2 = t.all(a=h,b=i)\n assert [r for r in t2.rows] == [[2,20], [3, 30]]\n\n\n \"\"\"\n return redux.filter_all(self, **kwargs)\n "},{"location":"reference/core/#tablite.core.Table.drop","title":"tablite.core.Table.drop(*args) ","text":"removes all rows where args are present. Example: t = Table() t['A'] = [1,2,3,None] t['B'] = [None,2,3,4] t2 = t.drop(None) t2['A'][:], t2['B'][:] ([2,3], [2,3]) Source code in tablite/core.py def drop(self, *args):\n \"\"\"\n removes all rows where args are present.\n\n Example:\n >>> t = Table()\n >>> t['A'] = [1,2,3,None]\n >>> t['B'] = [None,2,3,4]\n >>> t2 = t.drop(None)\n >>> t2['A'][:], t2['B'][:]\n ([2,3], [2,3])\n\n \"\"\"\n if not args:\n raise ValueError(\"What to drop? None? np.nan? \")\n return redux.drop(self, *args)\n "},{"location":"reference/core/#tablite.core.Table.replace","title":"tablite.core.Table.replace(mapping, columns=None, tqdm=_tqdm, pbar=None) ","text":"replaces all mapped keys with values from named columns PARAMETER DESCRIPTION mapping keys are targets for replacement, values are replacements. TYPE: dict columns target columns. 
Defaults to None (all columns) TYPE: list or str DEFAULT: None RAISES DESCRIPTION ValueError if a column name is not found. Source code in tablite/core.py def replace(self, mapping, columns=None, tqdm=_tqdm, pbar=None):\n \"\"\"replaces all mapped keys with values from named columns\n\n Args:\n mapping (dict): keys are targets for replacement,\n values are replacements.\n columns (list or str, optional): target columns.\n Defaults to None (all columns)\n\n Raises:\n ValueError: if a column name is not found.\n \"\"\"\n if columns is None:\n columns = list(self.columns)\n if not isinstance(columns, list) and columns in self.columns:\n columns = [columns]\n type_check(columns, list)\n for n in columns:\n if n not in self.columns:\n raise ValueError(f\"column not found: {n}\")\n\n if pbar is None:\n total = len(columns)\n pbar = tqdm(total=total, desc=\"replace\", disable=Config.TQDM_DISABLE)\n\n for name in columns:\n col = self.columns[name]\n col.replace(mapping)\n pbar.update(1)\n "},{"location":"reference/core/#tablite.core.Table.groupby","title":"tablite.core.Table.groupby(keys, functions, tqdm=_tqdm, pbar=None) ","text":"keys: column names for grouping. functions: [optional] list of column names and group functions (See GroupBy class) returns: table Example: t = Table()\nt.add_column('A', data=[1, 1, 2, 2, 3, 3] * 2)\nt.add_column('B', data=[1, 2, 3, 4, 5, 6] * 2)\nt.add_column('C', data=[6, 5, 4, 3, 2, 1] * 2)\n\nt.show()\n+=====+=====+=====+\n| A | B | C |\n| int | int | int |\n+-----+-----+-----+\n| 1| 1| 6|\n| 1| 2| 5|\n| 2| 3| 4|\n| 2| 4| 3|\n| 3| 5| 2|\n| 3| 6| 1|\n| 1| 1| 6|\n| 1| 2| 5|\n| 2| 3| 4|\n| 2| 4| 3|\n| 3| 5| 2|\n| 3| 6| 1|\n+=====+=====+=====+\n\ng = t.groupby(keys=['A', 'C'], functions=[('B', gb.sum)])\ng.show()\n+===+===+===+======+\n| # | A | C |Sum(B)|\n|row|int|int| int |\n+---+---+---+------+\n|0 | 1| 6| 2|\n|1 | 1| 5| 4|\n|2 | 2| 4| 6|\n|3 | 2| 3| 8|\n|4 | 3| 2| 10|\n|5 | 3| 1| 12|\n+===+===+===+======+\n Cheat sheet: list of unique values >>> g1 = t.groupby(keys=['A'], functions=[])\n>>> g1['A'][:]\n[1,2,3]\n alternatively: t['A'].unique() [1,2,3] list of unique values, grouped by longest combination. >>> g2 = t.groupby(keys=['A', 'B'], functions=[])\n>>> g2['A'][:], g2['B'][:]\n([1,1,2,2,3,3], [1,2,3,4,5,6])\n alternatively: >>> list(zip(*t.index('A', 'B').keys()))\n[(1,1,2,2,3,3) (1,2,3,4,5,6)]\n A key (unique values) and count hereof. 
>>> g3 = t.groupby(keys=['A'], functions=[('A', gb.count)])\n>>> g3['A'][:], g3['Count(A)'][:]\n([1,2,3], [4,4,4])\n alternatively: >>> t['A'].histogram()\n([1,2,3], [4,4,4])\n for more examples see: https://github.com/root-11/tablite/blob/master/tests/test_groupby.py Source code in tablite/core.py def groupby(self, keys, functions, tqdm=_tqdm, pbar=None):\n \"\"\"\n keys: column names for grouping.\n functions: [optional] list of column names and group functions (See GroupBy class)\n returns: table\n\n Example:\n ```\n t = Table()\n t.add_column('A', data=[1, 1, 2, 2, 3, 3] * 2)\n t.add_column('B', data=[1, 2, 3, 4, 5, 6] * 2)\n t.add_column('C', data=[6, 5, 4, 3, 2, 1] * 2)\n\n t.show()\n +=====+=====+=====+\n | A | B | C |\n | int | int | int |\n +-----+-----+-----+\n | 1| 1| 6|\n | 1| 2| 5|\n | 2| 3| 4|\n | 2| 4| 3|\n | 3| 5| 2|\n | 3| 6| 1|\n | 1| 1| 6|\n | 1| 2| 5|\n | 2| 3| 4|\n | 2| 4| 3|\n | 3| 5| 2|\n | 3| 6| 1|\n +=====+=====+=====+\n\n g = t.groupby(keys=['A', 'C'], functions=[('B', gb.sum)])\n g.show()\n +===+===+===+======+\n | # | A | C |Sum(B)|\n |row|int|int| int |\n +---+---+---+------+\n |0 | 1| 6| 2|\n |1 | 1| 5| 4|\n |2 | 2| 4| 6|\n |3 | 2| 3| 8|\n |4 | 3| 2| 10|\n |5 | 3| 1| 12|\n +===+===+===+======+\n ```\n Cheat sheet:\n\n list of unique values\n ```\n >>> g1 = t.groupby(keys=['A'], functions=[])\n >>> g1['A'][:]\n [1,2,3]\n ```\n alternatively:\n >>> t['A'].unique()\n [1,2,3]\n\n list of unique values, grouped by longest combination.\n ```\n >>> g2 = t.groupby(keys=['A', 'B'], functions=[])\n >>> g2['A'][:], g2['B'][:]\n ([1,1,2,2,3,3], [1,2,3,4,5,6])\n ```\n alternatively:\n ```\n >>> list(zip(*t.index('A', 'B').keys()))\n [(1,1,2,2,3,3) (1,2,3,4,5,6)]\n ```\n A key (unique values) and count hereof.\n ```\n >>> g3 = t.groupby(keys=['A'], functions=[('A', gb.count)])\n >>> g3['A'][:], g3['Count(A)'][:]\n ([1,2,3], [4,4,4])\n ```\n alternatively:\n ```\n >>> t['A'].histogram()\n ([1,2,3], [4,4,4])\n ```\n for more examples see:\n https://github.com/root-11/tablite/blob/master/tests/test_groupby.py\n\n \"\"\"\n return _groupby(self, keys, functions, tqdm)\n "},{"location":"reference/core/#tablite.core.Table.pivot","title":"tablite.core.Table.pivot(rows, columns, functions, values_as_rows=True, tqdm=_tqdm, pbar=None) ","text":"param: rows: column names to keep as rows param: columns: column names to keep as columns param: functions: aggregation functions from the GroupBy class as example: t.show()\n+=====+=====+=====+\n| A | B | C |\n| int | int | int |\n+-----+-----+-----+\n| 1| 1| 6|\n| 1| 2| 5|\n| 2| 3| 4|\n| 2| 4| 3|\n| 3| 5| 2|\n| 3| 6| 1|\n| 1| 1| 6|\n| 1| 2| 5|\n| 2| 3| 4|\n| 2| 4| 3|\n| 3| 5| 2|\n| 3| 6| 1|\n+=====+=====+=====+\n\nt2 = t.pivot(rows=['C'], columns=['A'], functions=[('B', gb.sum)])\nt2.show()\n+===+===+========+=====+=====+=====+\n| # | C |function|(A=1)|(A=2)|(A=3)|\n|row|int| str |mixed|mixed|mixed|\n+---+---+--------+-----+-----+-----+\n|0 | 6|Sum(B) | 2|None |None |\n|1 | 5|Sum(B) | 4|None |None |\n|2 | 4|Sum(B) |None | 6|None |\n|3 | 3|Sum(B) |None | 8|None |\n|4 | 2|Sum(B) |None |None | 10|\n|5 | 1|Sum(B) |None |None | 12|\n+===+===+========+=====+=====+=====+\n Source code in tablite/core.py def pivot(self, rows, columns, functions, values_as_rows=True, tqdm=_tqdm, pbar=None):\n \"\"\"\n param: rows: column names to keep as rows\n param: columns: column names to keep as columns\n param: functions: aggregation functions from the GroupBy class as\n\n example:\n ```\n t.show()\n +=====+=====+=====+\n | A | B | C |\n | int | int | int |\n 
+-----+-----+-----+\n | 1| 1| 6|\n | 1| 2| 5|\n | 2| 3| 4|\n | 2| 4| 3|\n | 3| 5| 2|\n | 3| 6| 1|\n | 1| 1| 6|\n | 1| 2| 5|\n | 2| 3| 4|\n | 2| 4| 3|\n | 3| 5| 2|\n | 3| 6| 1|\n +=====+=====+=====+\n\n t2 = t.pivot(rows=['C'], columns=['A'], functions=[('B', gb.sum)])\n t2.show()\n +===+===+========+=====+=====+=====+\n | # | C |function|(A=1)|(A=2)|(A=3)|\n |row|int| str |mixed|mixed|mixed|\n +---+---+--------+-----+-----+-----+\n |0 | 6|Sum(B) | 2|None |None |\n |1 | 5|Sum(B) | 4|None |None |\n |2 | 4|Sum(B) |None | 6|None |\n |3 | 3|Sum(B) |None | 8|None |\n |4 | 2|Sum(B) |None |None | 10|\n |5 | 1|Sum(B) |None |None | 12|\n +===+===+========+=====+=====+=====+\n ```\n \"\"\"\n return pivots.pivot(self, rows, columns, functions, values_as_rows, tqdm=tqdm, pbar=pbar)\n "},{"location":"reference/core/#tablite.core.Table.merge","title":"tablite.core.Table.merge(left, right, new, criteria) ","text":"takes from LEFT where criteria is True else RIGHT. :param: T: Table :param: criteria: np.array(bool): if True take left column else take right column :param left: (str) column name :param right: (str) column name :param new: (str) new name :returns: T Example: >>> c.show()\n+==+====+====+====+====+\n| #| A | B | C | D |\n+--+----+----+----+----+\n| 0| 1| 10| 1| 11|\n| 1| 2| 20| 2| 12|\n| 2| 3|None| 3| 13|\n| 3|None| 40|None|None|\n| 4| 5| 50|None|None|\n| 5|None|None| 6| 16|\n| 6|None|None| 7| 17|\n+==+====+====+====+====+\n\n>>> c.merge(\"A\", \"C\", new=\"E\", criteria=[v != None for v in c['A']])\n>>> c.show()\n+==+====+====+====+\n| #| B | D | E |\n+--+----+----+----+\n| 0| 10| 11| 1|\n| 1| 20| 12| 2|\n| 2|None| 13| 3|\n| 3| 40|None|None|\n| 4| 50|None| 5|\n| 5|None| 16| 6|\n| 6|None| 17| 7|\n+==+====+====+====+\n Source code in tablite/core.py def merge(self, left, right, new, criteria):\n \"\"\" takes from LEFT where criteria is True else RIGHT.\n :param: T: Table\n :param: criteria: np.array(bool): \n if True take left column\n else take right column\n :param left: (str) column name\n :param right: (str) column name\n :param new: (str) new name\n\n :returns: T\n\n Example:\n ```\n >>> c.show()\n +==+====+====+====+====+\n | #| A | B | C | D |\n +--+----+----+----+----+\n | 0| 1| 10| 1| 11|\n | 1| 2| 20| 2| 12|\n | 2| 3|None| 3| 13|\n | 3|None| 40|None|None|\n | 4| 5| 50|None|None|\n | 5|None|None| 6| 16|\n | 6|None|None| 7| 17|\n +==+====+====+====+====+\n\n >>> c.merge(\"A\", \"C\", new=\"E\", criteria=[v != None for v in c['A']])\n >>> c.show()\n +==+====+====+====+\n | #| B | D | E |\n +--+----+----+----+\n | 0| 10| 11| 1|\n | 1| 20| 12| 2|\n | 2|None| 13| 3|\n | 3| 40|None|None|\n | 4| 50|None| 5|\n | 5|None| 16| 6|\n | 6|None| 17| 7|\n +==+====+====+====+\n ```\n \"\"\"\n return merge.where(self, criteria,left,right,new)\n "},{"location":"reference/core/#tablite.core.Table.column_select","title":"tablite.core.Table.column_select(cols: list[ColumnSelectorDict], tqdm=_tqdm, TaskManager=_TaskManager) ","text":"type-casts columns from a given table to specified type(s) cols list of dicts: (example): cols = [\n {'column':'A', 'type': 'bool'},\n {'column':'B', 'type': 'int', 'allow_empty': True},\n {'column':'B', 'type': 'float', 'allow_empty': False, 'rename': 'C'},\n]\n 'column' : column name of the input table that we want to type-cast 'type' : type that we want to type-cast the specified column to 'allow_empty': should we allow empty values (None, str('')) through (Default: False) 'rename' : new name of the column, if None will keep the original name, in case of duplicates suffix will 
be added (Default: None) supported types: 'bool', 'int', 'float', 'str', 'date', 'time', 'datetime' if any of the columns is rejected, entire row is rejected tqdm: progressbar constructor TaskManager: TaskManager constructor (TABLE, TABLE) DESCRIPTION first table contains the rows that were successfully cast to desired types second table contains rows that failed to cast + rejection reason Source code in tablite/core.py def column_select(self, cols: list[ColumnSelectorDict], tqdm=_tqdm, TaskManager=_TaskManager):\n \"\"\"\n type-casts columns from a given table to specified type(s)\n\n cols:\n list of dicts: (example):\n\n cols = [\n {'column':'A', 'type': 'bool'},\n {'column':'B', 'type': 'int', 'allow_empty': True},\n {'column':'B', 'type': 'float', 'allow_empty': False, 'rename': 'C'},\n ]\n\n 'column' : column name of the input table that we want to type-cast\n 'type' : type that we want to type-cast the specified column to\n 'allow_empty': should we allow empty values (None, str('')) through (Default: False)\n 'rename' : new name of the column, if None will keep the original name, in case of duplicates suffix will be added (Default: None)\n\n supported types: 'bool', 'int', 'float', 'str', 'date', 'time', 'datetime'\n\n if any of the columns is rejected, entire row is rejected\n\n tqdm: progressbar constructor\n TaskManager: TaskManager constructor\n\n returns: (Table, Table)\n first table contains the rows that were successfully cast to desired types\n second table contains rows that failed to cast + rejection reason\n \"\"\"\n return _column_select(self, cols, tqdm, TaskManager)\n "},{"location":"reference/core/#tablite.core.Table.join","title":"tablite.core.Table.join(other, left_keys, right_keys, left_columns=None, right_columns=None, kind='inner', merge_keys=False, tqdm=_tqdm, pbar=None) ","text":"short-cut for all join functions. kind: 'inner', 'left', 'outer', 'cross' Source code in tablite/core.py def join(self, other, left_keys, right_keys, left_columns=None, right_columns=None, kind=\"inner\", merge_keys=False, tqdm=_tqdm, pbar=None):\n \"\"\"\n short-cut for all join functions.\n kind: 'inner', 'left', 'outer', 'cross'\n \"\"\"\n kinds = {\n \"inner\": self.inner_join,\n \"left\": self.left_join,\n \"outer\": self.outer_join,\n \"cross\": self.cross_join,\n }\n if kind not in kinds:\n raise ValueError(f\"join type unknown: {kind}\")\n f = kinds.get(kind, None)\n return f(other, left_keys, right_keys, left_columns, right_columns, merge_keys=merge_keys, tqdm=tqdm, pbar=pbar)\n "},{"location":"reference/core/#tablite.core.Table.left_join","title":"tablite.core.Table.left_join(other, left_keys, right_keys, left_columns=None, right_columns=None, merge_keys=False, tqdm=_tqdm, pbar=None) ","text":":param other: self, other = (left, right) :param left_keys: list of keys for the join :param right_keys: list of keys for the join :param left_columns: list of left columns to retain, if None, all are retained. :param right_columns: list of right columns to retain, if None, all are retained. 
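A usage sketch of column_select based on the cols schema above; per the docstring it returns two tables, the rows that cast successfully and the rejected rows. The values are illustrative; exact cast rules follow tablite's type system.

```python
from tablite import Table

t = Table()
t['A'] = ['1', '0', 'x']       # 'x' is expected to fail the bool cast
t['B'] = ['1.5', '', '2.0']

passed, failed = t.column_select(
    cols=[
        {'column': 'A', 'type': 'bool'},
        {'column': 'B', 'type': 'float', 'allow_empty': True, 'rename': 'C'},
    ]
)
# 'failed' carries the rejected rows plus the rejection reason.
```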
:return: new Table Example: SQL: SELECT number, letter FROM numbers LEFT JOIN letters ON numbers.colour == letters.color\nTablite: left_join = numbers.left_join(\n letters, left_keys=['colour'], right_keys=['color'], left_columns=['number'], right_columns=['letter']\n)\n Source code in tablite/core.py def left_join(self, other, left_keys, right_keys, left_columns=None, right_columns=None, merge_keys=False, tqdm=_tqdm, pbar=None):\n \"\"\"\n :param other: self, other = (left, right)\n :param left_keys: list of keys for the join\n :param right_keys: list of keys for the join\n :param left_columns: list of left columns to retain, if None, all are retained.\n :param right_columns: list of right columns to retain, if None, all are retained.\n :return: new Table\n Example:\n ```\n SQL: SELECT number, letter FROM numbers LEFT JOIN letters ON numbers.colour == letters.color\n Tablite: left_join = numbers.left_join(\n letters, left_keys=['colour'], right_keys=['color'], left_columns=['number'], right_columns=['letter']\n )\n ```\n \"\"\"\n return joins.left_join(self, other, left_keys, right_keys, left_columns, right_columns, merge_keys=merge_keys, tqdm=tqdm, pbar=pbar)\n "},{"location":"reference/core/#tablite.core.Table.inner_join","title":"tablite.core.Table.inner_join(other, left_keys, right_keys, left_columns=None, right_columns=None, merge_keys=False, tqdm=_tqdm, pbar=None) ","text":":param other: self, other = (left, right) :param left_keys: list of keys for the join :param right_keys: list of keys for the join :param left_columns: list of left columns to retain, if None, all are retained. :param right_columns: list of right columns to retain, if None, all are retained. :return: new Table Example: SQL: SELECT number, letter FROM numbers JOIN letters ON numbers.colour == letters.color\nTablite: inner_join = numbers.inner_join(\n letters, left_keys=['colour'], right_keys=['color'], left_columns=['number'], right_columns=['letter']\n )\n Source code in tablite/core.py def inner_join(self, other, left_keys, right_keys, left_columns=None, right_columns=None, merge_keys=False, tqdm=_tqdm, pbar=None):\n \"\"\"\n :param other: self, other = (left, right)\n :param left_keys: list of keys for the join\n :param right_keys: list of keys for the join\n :param left_columns: list of left columns to retain, if None, all are retained.\n :param right_columns: list of right columns to retain, if None, all are retained.\n :return: new Table\n Example:\n ```\n SQL: SELECT number, letter FROM numbers JOIN letters ON numbers.colour == letters.color\n Tablite: inner_join = numbers.inner_join(\n letters, left_keys=['colour'], right_keys=['color'], left_columns=['number'], right_columns=['letter']\n )\n ```\n \"\"\"\n return joins.inner_join(self, other, left_keys, right_keys, left_columns, right_columns, merge_keys=merge_keys, tqdm=tqdm, pbar=pbar)\n "},{"location":"reference/core/#tablite.core.Table.outer_join","title":"tablite.core.Table.outer_join(other, left_keys, right_keys, left_columns=None, right_columns=None, merge_keys=False, tqdm=_tqdm, pbar=None) ","text":":param other: self, other = (left, right) :param left_keys: list of keys for the join :param right_keys: list of keys for the join :param left_columns: list of left columns to retain, if None, all are retained. :param right_columns: list of right columns to retain, if None, all are retained. 
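The join shortcut shown above dispatches on kind; a sketch mirroring the SQL examples from these docstrings, with illustrative data:

```python
from tablite import Table

numbers, letters = Table(), Table()
numbers['number'] = [1, 2, 3]
numbers['colour'] = ['red', 'blue', 'red']
letters['letter'] = ['a', 'b']
letters['color'] = ['red', 'blue']

result = numbers.join(
    letters,
    left_keys=['colour'], right_keys=['color'],
    left_columns=['number'], right_columns=['letter'],
    kind='left',               # one of 'inner', 'left', 'outer', 'cross'
)
```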
:return: new Table Example: SQL: SELECT number, letter FROM numbers OUTER JOIN letters ON numbers.colour == letters.color\nTablite: outer_join = numbers.outer_join(\n letters, left_keys=['colour'], right_keys=['color'], left_columns=['number'], right_columns=['letter']\n )\n Source code in tablite/core.py def outer_join(self, other, left_keys, right_keys, left_columns=None, right_columns=None, merge_keys=False, tqdm=_tqdm, pbar=None):\n \"\"\"\n :param other: self, other = (left, right)\n :param left_keys: list of keys for the join\n :param right_keys: list of keys for the join\n :param left_columns: list of left columns to retain, if None, all are retained.\n :param right_columns: list of right columns to retain, if None, all are retained.\n :return: new Table\n Example:\n ```\n SQL: SELECT number, letter FROM numbers OUTER JOIN letters ON numbers.colour == letters.color\n Tablite: outer_join = numbers.outer_join(\n letters, left_keys=['colour'], right_keys=['color'], left_columns=['number'], right_columns=['letter']\n )\n ```\n \"\"\"\n return joins.outer_join(self, other, left_keys, right_keys, left_columns, right_columns, merge_keys=merge_keys, tqdm=tqdm, pbar=pbar)\n "},{"location":"reference/core/#tablite.core.Table.cross_join","title":"tablite.core.Table.cross_join(other, left_keys, right_keys, left_columns=None, right_columns=None, merge_keys=False, tqdm=_tqdm, pbar=None) ","text":"CROSS JOIN returns the Cartesian product of rows from tables in the join. In other words, it will produce rows which combine each row from the first table with each row from the second table Source code in tablite/core.py def cross_join(self, other, left_keys, right_keys, left_columns=None, right_columns=None, merge_keys=False, tqdm=_tqdm, pbar=None):\n \"\"\"\n CROSS JOIN returns the Cartesian product of rows from tables in the join.\n In other words, it will produce rows which combine each row from the first table\n with each row from the second table\n \"\"\"\n return joins.cross_join(self, other, left_keys, right_keys, left_columns, right_columns, merge_keys=merge_keys, tqdm=tqdm, pbar=pbar)\n "},{"location":"reference/core/#tablite.core.Table.lookup","title":"tablite.core.Table.lookup(other, *criteria, all=True, tqdm=_tqdm) ","text":"function for looking up values in other according to criteria in ascending order. :param: other: Table sorted in ascending search order. :param: criteria: Each criteria must be a tuple with value comparisons in the form: (LEFT, OPERATOR, RIGHT) :param: all: boolean: True=ALL, False=Any OPERATOR must be a callable that returns a boolean LEFT must be a value that the OPERATOR can compare. RIGHT must be a value that the OPERATOR can compare. 
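A sketch of cross_join producing the Cartesian product described above; the key arguments mirror the other join signatures, and the data is illustrative:

```python
from tablite import Table

sizes, colours = Table(), Table()
sizes['size'] = ['S', 'M', 'L']
colours['colour'] = ['red', 'blue']

cartesian = sizes.cross_join(
    colours, left_keys=['size'], right_keys=['colour']
)
assert len(cartesian) == 6     # 3 x 2 combined rows
```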
Examples: ('column A', \"==\", 'column B') # comparison of two columns\n('Date', \"<\", DataTypes.date(24,12) ) # value from column 'Date' is before 24/12.\nf = lambda L,R: all( ord(L) < ord(R) ) # uses custom function.\n('text 1', f, 'text 2') value from column 'text 1' is compared with value from column 'text 2'\n Source code in tablite/core.py def lookup(self, other, *criteria, all=True, tqdm=_tqdm):\n \"\"\"function for looking up values in `other` according to criteria in ascending order.\n :param: other: Table sorted in ascending search order.\n :param: criteria: Each criteria must be a tuple with value comparisons in the form:\n (LEFT, OPERATOR, RIGHT)\n :param: all: boolean: True=ALL, False=Any\n\n OPERATOR must be a callable that returns a boolean\n LEFT must be a value that the OPERATOR can compare.\n RIGHT must be a value that the OPERATOR can compare.\n\n Examples:\n ```\n ('column A', \"==\", 'column B') # comparison of two columns\n ('Date', \"<\", DataTypes.date(24,12) ) # value from column 'Date' is before 24/12.\n f = lambda L,R: all( ord(L) < ord(R) ) # uses custom function.\n ('text 1', f, 'text 2') value from column 'text 1' is compared with value from column 'text 2'\n ```\n \"\"\"\n return lookup.lookup(self, other, *criteria, all=all, tqdm=tqdm)\n "},{"location":"reference/core/#tablite.core.Table.match","title":"tablite.core.Table.match(other, *criteria, keep_left=None, keep_right=None) ","text":"performs inner join where T matches other and removes rows that do not match. :param: T: Table :param: other: Table :param: criteria: Each criteria must be a tuple with value comparisons in the form: (LEFT, OPERATOR, RIGHT), where operator must be \"==\"\n\nExample:\n ('column A', \"==\", 'column B')\n\nThis syntax follows the lookup syntax. See Lookup for details.\n :param: keep_left: list of columns to keep. :param: keep_right: list of right columns to keep. Source code in tablite/core.py def match(self, other, *criteria, keep_left=None, keep_right=None):\n \"\"\"\n performs inner join where `T` matches `other` and removes rows that do not match.\n\n :param: T: Table\n :param: other: Table\n :param: criteria: Each criteria must be a tuple with value comparisons in the form:\n\n (LEFT, OPERATOR, RIGHT), where operator must be \"==\"\n\n Example:\n ('column A', \"==\", 'column B')\n\n This syntax follows the lookup syntax. See Lookup for details.\n\n :param: keep_left: list of columns to keep.\n :param: keep_right: list of right columns to keep.\n \"\"\"\n return match.match(self, other, *criteria, keep_left=keep_left, keep_right=keep_right)\n "},{"location":"reference/core/#tablite.core.Table.replace_missing_values","title":"tablite.core.Table.replace_missing_values(*args, **kwargs) ","text":"Source code in tablite/core.py def replace_missing_values(self, *args, **kwargs):\n raise AttributeError(\"See imputation\")\n "},{"location":"reference/core/#tablite.core.Table.imputation","title":"tablite.core.Table.imputation(targets, missing=None, method='carry forward', sources=None, tqdm=_tqdm) ","text":"In statistics, imputation is the process of replacing missing data with substituted values. See more: https://en.wikipedia.org/wiki/Imputation_(statistics) PARAMETER DESCRIPTION table source table. TYPE: Table targets column names to find and replace missing values TYPE: str or list of strings missing values to be replaced. TYPE: None or iterable DEFAULT: None method method to be used for replacement. 
Options: 'carry forward': takes the previous value, and carries forward into fields where values are missing. +: quick. Realistic on time series. -: Can produce strange outliers. 'mean': calculates the column mean (exclude missing ) and copies the mean in as replacement. +: quick -: doesn't work on text. Causes data set to drift towards the mean. 'mode': calculates the column mode (exclude missing ) and copies the mode in as replacement. +: quick -: most frequent value becomes over-represented in the sample 'nearest neighbour': calculates normalised distance between items in source columns selects nearest neighbour and copies value as replacement. +: works for any datatype. -: computationally intensive (e.g. slow) TYPE: str DEFAULT: 'carry forward' sources NEAREST NEIGHBOUR ONLY column names to be used during imputation. if None or empty, all columns will be used. TYPE: list of strings DEFAULT: None RETURNS DESCRIPTION table table with replaced values. Source code in tablite/core.py def imputation(self, targets, missing=None, method=\"carry forward\", sources=None, tqdm=_tqdm):\n \"\"\"\n In statistics, imputation is the process of replacing missing data with substituted values.\n\n See more: https://en.wikipedia.org/wiki/Imputation_(statistics)\n\n Args:\n table (Table): source table.\n\n targets (str or list of strings): column names to find and\n replace missing values\n\n missing (None or iterable): values to be replaced.\n\n method (str): method to be used for replacement. Options:\n\n 'carry forward':\n takes the previous value, and carries forward into fields\n where values are missing.\n +: quick. Realistic on time series.\n -: Can produce strange outliers.\n\n 'mean':\n calculates the column mean (exclude `missing`) and copies\n the mean in as replacement.\n +: quick\n -: doesn't work on text. Causes data set to drift towards the mean.\n\n 'mode':\n calculates the column mode (exclude `missing`) and copies\n the mode in as replacement.\n +: quick\n -: most frequent value becomes over-represented in the sample\n\n 'nearest neighbour':\n calculates normalised distance between items in source columns\n selects nearest neighbour and copies value as replacement.\n +: works for any datatype.\n -: computationally intensive (e.g. slow)\n\n sources (list of strings): NEAREST NEIGHBOUR ONLY\n column names to be used during imputation.\n if None or empty, all columns will be used.\n\n Returns:\n table: table with replaced values.\n \"\"\"\n return imputation.imputation(self, targets, missing, method, sources, tqdm=tqdm)\n "},{"location":"reference/core/#tablite.core.Table.transpose","title":"tablite.core.Table.transpose(tqdm=_tqdm) ","text":"Source code in tablite/core.py def transpose(self, tqdm=_tqdm):\n return pivots.transpose(self, tqdm)\n "},{"location":"reference/core/#tablite.core.Table.pivot_transpose","title":"tablite.core.Table.pivot_transpose(columns, keep=None, column_name='transpose', value_name='value', tqdm=_tqdm) ","text":"Transpose a selection of columns to rows. PARAMETER DESCRIPTION columns column names to transpose TYPE: list of column names keep column names to keep (repeat) TYPE: list of column names DEFAULT: None RETURNS DESCRIPTION Table with columns transposed to rows Example keep columns 1, 2 and 3 and transpose the remaining columns, except sum . Input: | col1 | col2 | col3 | sun | mon | tue | ... | sat | sum |\n|------|------|------|-----|-----|-----|-----|-----|------|\n| 1234 | 2345 | 3456 | 456 | 567 | | ... | | 1023 |\n| 1244 | 2445 | 4456 | | 7 | | ... 
| | 7 |\n| ... | | | | | | | | |\n\nt.transpose(keep=[col1, col2, col3], transpose=[sun,mon,tue,wed,thu,fri,sat])`\n\nOutput:\n\n|col1| col2| col3| transpose| value|\n|----|-----|-----|----------|------|\n|1234| 2345| 3456| sun | 456|\n|1234| 2345| 3456| mon | 567|\n|1244| 2445| 4456| mon | 7|\n Source code in tablite/core.py def pivot_transpose(self, columns, keep=None, column_name=\"transpose\", value_name=\"value\", tqdm=_tqdm):\n \"\"\"Transpose a selection of columns to rows.\n\n Args:\n columns (list of column names): column names to transpose\n keep (list of column names): column names to keep (repeat)\n\n Returns:\n Table: with columns transposed to rows\n\n Example:\n keep columns 1, 2 and 3 and transpose the remaining columns, except `sum`.\n\n Input:\n ```\n | col1 | col2 | col3 | sun | mon | tue | ... | sat | sum |\n |------|------|------|-----|-----|-----|-----|-----|------|\n | 1234 | 2345 | 3456 | 456 | 567 | | ... | | 1023 |\n | 1244 | 2445 | 4456 | | 7 | | ... | | 7 |\n | ... | | | | | | | | |\n\n t.transpose(keep=[col1, col2, col3], transpose=[sun,mon,tue,wed,thu,fri,sat])`\n\n Output:\n\n |col1| col2| col3| transpose| value|\n |----|-----|-----|----------|------|\n |1234| 2345| 3456| sun | 456|\n |1234| 2345| 3456| mon | 567|\n |1244| 2445| 4456| mon | 7|\n ```\n \"\"\"\n return pivots.pivot_transpose(self, columns, keep, column_name, value_name, tqdm=tqdm)\n "},{"location":"reference/core/#tablite.core.Table.diff","title":"tablite.core.Table.diff(other, columns=None) ","text":"compares table self with table other PARAMETER DESCRIPTION self Table TYPE: Table other Table TYPE: Table columns list of column names to include in comparison. Defaults to None. TYPE: List DEFAULT: None RETURNS DESCRIPTION Table diff of self and other with diff in columns 1st and 2nd. Source code in tablite/core.py def diff(self, other, columns=None):\n \"\"\"compares table self with table other\n\n Args:\n self (Table): Table\n other (Table): Table\n columns (List, optional): list of column names to include in comparison. 
Defaults to None.\n\n Returns:\n Table: diff of self and other with diff in columns 1st and 2nd.\n \"\"\"\n return diff.diff(self, other, columns)\n "},{"location":"reference/core/#tablite.core-functions","title":"Functions","text":""},{"location":"reference/core/#tablite.core-modules","title":"Modules","text":""},{"location":"reference/datasets/","title":"Datasets","text":""},{"location":"reference/datasets/#tablite.datasets","title":"tablite.datasets ","text":""},{"location":"reference/datasets/#tablite.datasets-classes","title":"Classes","text":""},{"location":"reference/datasets/#tablite.datasets-functions","title":"Functions","text":""},{"location":"reference/datasets/#tablite.datasets.synthetic_order_data","title":"tablite.datasets.synthetic_order_data(rows=100000) ","text":"Creates a synthetic dataset for testing that looks like this: (depending on number of rows) +=========+=======+=============+===================+=====+===+=====+====+===+=====+=====+===================+==================+\n| ~ | # | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 |\n| row | int | int | datetime | int |int| int |str |str|mixed|mixed| float | float |\n+---------+-------+-------------+-------------------+-----+---+-----+----+---+-----+-----+-------------------+------------------+\n|0 | 1|1478158906743|2021-10-27 00:00:00|50764| 1|29990|C4-5|APP|21\u00b0 |None | 2.0434376837650046|1.3371665497020444|\n|1 | 2|2271295805011|2021-09-13 00:00:00|50141| 0|10212|C4-5|TAE|None |None | 1.010318612835485| 20.94821610676901|\n|2 | 3|1598726492913|2021-08-19 00:00:00|50527| 0|19416|C3-5|QPV|21\u00b0 |None | 1.463459515469516| 17.4133659842749|\n|3 | 4|1413615572689|2021-11-05 00:00:00|50181| 1|18637|C4-2|GCL|6\u00b0 |ABC | 2.084002469706324| 0.489481411683505|\n|4 | 5| 245266998048|2021-09-25 00:00:00|50378| 0|29756|C5-4|LGY|6\u00b0 |XYZ | 0.5141579343276079| 8.550780816571438|\n|5 | 6| 947994853644|2021-10-14 00:00:00|50511| 0| 7890|C2-4|BET|0\u00b0 |XYZ | 1.1725893606177542| 7.447314130260951|\n|6 | 7|2230693047809|2021-10-07 00:00:00|50987| 1|26742|C1-3|CFP|0\u00b0 |XYZ | 1.0921267279498004|11.009210185311993|\n|... |... |... |... |... |...|... |... |...|... |... |... |... |\n|7,999,993|7999994|2047223556745|2021-09-03 00:00:00|50883| 1|15687|C3-1|RFR|None |XYZ | 1.3467185981566827|17.023443485654845|\n|7,999,994|7999995|1814140654790|2021-08-02 00:00:00|50152| 0|16556|C4-2|WTC|None |ABC | 1.1517593924478968| 8.201818634721487|\n|7,999,995|7999996| 155308171103|2021-10-14 00:00:00|50008| 1|14590|C1-3|WYM|0\u00b0 |None | 2.1273836233717978|23.295943554889195|\n|7,999,996|7999997|1620451532911|2021-12-12 00:00:00|50173| 1|20744|C2-1|ZYO|6\u00b0 |ABC | 2.482509134693724| 22.25375464857266|\n|7,999,997|7999998|1248987682094|2021-12-20 00:00:00|50052| 1|28298|C5-4|XAW|None |XYZ |0.17923757926558143|23.728160892974252|\n|7,999,998|7999999|1382206732187|2021-11-13 00:00:00|50993| 1|24832|C5-2|UDL|None |ABC |0.08425329763360942|12.707735293126758|\n|7,999,999|8000000| 600688069780|2021-09-28 00:00:00|50510| 0|15819|C3-4|IGY|None |ABC | 1.066241687256579|13.862069804070295|\n+=========+=======+=============+===================+=====+===+=====+====+===+=====+=====+===================+==================+\n PARAMETER DESCRIPTION rows number of rows wanted. Defaults to 100_000. TYPE: int DEFAULT: 100000 RETURNS DESCRIPTION Table Populated table. 
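A usage sketch (added here): generating a small synthetic table for tests; the row count is illustrative.

```python
from tablite.datasets import synthetic_order_data

t = synthetic_order_data(rows=1_000)   # defaults to 100_000 rows
t.show()                               # prints a table like the one above
```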
TYPE: Table Source code in tablite/datasets.py def synthetic_order_data(rows=100_000):\n \"\"\"Creates a synthetic dataset for testing that looks like this:\n (depending on number of rows)\n\n ```\n +=========+=======+=============+===================+=====+===+=====+====+===+=====+=====+===================+==================+\n | ~ | # | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 |\n | row | int | int | datetime | int |int| int |str |str|mixed|mixed| float | float |\n +---------+-------+-------------+-------------------+-----+---+-----+----+---+-----+-----+-------------------+------------------+\n |0 | 1|1478158906743|2021-10-27 00:00:00|50764| 1|29990|C4-5|APP|21\u00b0 |None | 2.0434376837650046|1.3371665497020444|\n |1 | 2|2271295805011|2021-09-13 00:00:00|50141| 0|10212|C4-5|TAE|None |None | 1.010318612835485| 20.94821610676901|\n |2 | 3|1598726492913|2021-08-19 00:00:00|50527| 0|19416|C3-5|QPV|21\u00b0 |None | 1.463459515469516| 17.4133659842749|\n |3 | 4|1413615572689|2021-11-05 00:00:00|50181| 1|18637|C4-2|GCL|6\u00b0 |ABC | 2.084002469706324| 0.489481411683505|\n |4 | 5| 245266998048|2021-09-25 00:00:00|50378| 0|29756|C5-4|LGY|6\u00b0 |XYZ | 0.5141579343276079| 8.550780816571438|\n |5 | 6| 947994853644|2021-10-14 00:00:00|50511| 0| 7890|C2-4|BET|0\u00b0 |XYZ | 1.1725893606177542| 7.447314130260951|\n |6 | 7|2230693047809|2021-10-07 00:00:00|50987| 1|26742|C1-3|CFP|0\u00b0 |XYZ | 1.0921267279498004|11.009210185311993|\n |... |... |... |... |... |...|... |... |...|... |... |... |... |\n |7,999,993|7999994|2047223556745|2021-09-03 00:00:00|50883| 1|15687|C3-1|RFR|None |XYZ | 1.3467185981566827|17.023443485654845|\n |7,999,994|7999995|1814140654790|2021-08-02 00:00:00|50152| 0|16556|C4-2|WTC|None |ABC | 1.1517593924478968| 8.201818634721487|\n |7,999,995|7999996| 155308171103|2021-10-14 00:00:00|50008| 1|14590|C1-3|WYM|0\u00b0 |None | 2.1273836233717978|23.295943554889195|\n |7,999,996|7999997|1620451532911|2021-12-12 00:00:00|50173| 1|20744|C2-1|ZYO|6\u00b0 |ABC | 2.482509134693724| 22.25375464857266|\n |7,999,997|7999998|1248987682094|2021-12-20 00:00:00|50052| 1|28298|C5-4|XAW|None |XYZ |0.17923757926558143|23.728160892974252|\n |7,999,998|7999999|1382206732187|2021-11-13 00:00:00|50993| 1|24832|C5-2|UDL|None |ABC |0.08425329763360942|12.707735293126758|\n |7,999,999|8000000| 600688069780|2021-09-28 00:00:00|50510| 0|15819|C3-4|IGY|None |ABC | 1.066241687256579|13.862069804070295|\n +=========+=======+=============+===================+=====+===+=====+====+===+=====+=====+===================+==================+\n ```\n\n Args:\n rows (int, optional): number of rows wanted. 
Defaults to 100_000.\n\n Returns:\n Table (Table): Populated table.\n \"\"\" # noqa\n rows = int(rows)\n\n L1 = [\"None\", \"0\u00b0\", \"6\u00b0\", \"21\u00b0\"]\n L2 = [\"ABC\", \"XYZ\", \"\"]\n\n t = Table()\n assert isinstance(t, Table)\n for page_n in range(math.ceil(rows / Config.PAGE_SIZE)): # n pages\n start = (page_n * Config.PAGE_SIZE)\n end = min(start + Config.PAGE_SIZE, rows)\n ro = range(start, end)\n\n t2 = Table()\n t2[\"#\"] = [v+1 for v in ro]\n # 1 - mock orderid\n t2[\"1\"] = [random.randint(18_778_628_504, 2277_772_117_504) for i in ro]\n # 2 - mock delivery date.\n t2[\"2\"] = [datetime.fromordinal(random.randint(738000, 738150)).isoformat() for i in ro]\n # 3 - mock store id.\n t2[\"3\"] = [random.randint(50000, 51000) for _ in ro]\n # 4 - random bit.\n t2[\"4\"] = [random.randint(0, 1) for _ in ro]\n # 5 - mock product id\n t2[\"5\"] = [random.randint(3000, 30000) for _ in ro]\n # 6 - random weird string\n t2[\"6\"] = [f\"C{random.randint(1, 5)}-{random.randint(1, 5)}\" for _ in ro]\n # 7 - # random category\n t2[\"7\"] = [\"\".join(random.choice(ascii_uppercase) for _ in range(3)) for _ in ro]\n # 8 -random temperature group.\n t2[\"8\"] = [random.choice(L1) for _ in ro]\n # 9 - random choice of category\n t2[\"9\"] = [random.choice(L2) for _ in ro]\n # 10 - volume?\n t2[\"10\"] = [random.uniform(0.01, 2.5) for _ in ro]\n # 11 - units?\n t2[\"11\"] = [f\"{random.uniform(0.1, 25)}\" for _ in ro]\n\n if len(t) == 0:\n t = t2\n else:\n t += t2\n\n return t\n "},{"location":"reference/datatypes/","title":"Datatypes","text":""},{"location":"reference/datatypes/#tablite.datatypes","title":"tablite.datatypes ","text":""},{"location":"reference/datatypes/#tablite.datatypes-attributes","title":"Attributes","text":""},{"location":"reference/datatypes/#tablite.datatypes.matched_types","title":"tablite.datatypes.matched_types = {int: DataTypes._infer_int, str: DataTypes._infer_str, float: DataTypes._infer_float, bool: DataTypes._infer_bool, date: DataTypes._infer_date, datetime: DataTypes._infer_datetime, time: DataTypes._infer_time} module-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes-classes","title":"Classes","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes","title":"tablite.datatypes.DataTypes ","text":" Bases: object DataTypes is the conversion library for all datatypes. It supports any / all python datatypes. 
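A hedged sketch using two of the conversion helpers documented below (type_code and to_bytes); the value is illustrative, and the byte form follows the b_date helper (isoformat bytes) shown later in this section.

```python
from datetime import date
from tablite.datatypes import DataTypes

v = date(2024, 1, 1)
code = DataTypes.type_code(v)   # numeric type code registered for datetime.date
b = DataTypes.to_bytes(v)       # b'2024-01-01' via b_date (isoformat bytes)
```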
"},{"location":"reference/datatypes/#tablite.datatypes.DataTypes-attributes","title":"Attributes","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.int","title":"tablite.datatypes.DataTypes.int = int class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.str","title":"tablite.datatypes.DataTypes.str = str class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.float","title":"tablite.datatypes.DataTypes.float = float class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.bool","title":"tablite.datatypes.DataTypes.bool = bool class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.date","title":"tablite.datatypes.DataTypes.date = date class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.datetime","title":"tablite.datatypes.DataTypes.datetime = datetime class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.time","title":"tablite.datatypes.DataTypes.time = time class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.timedelta","title":"tablite.datatypes.DataTypes.timedelta = timedelta class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.numeric_types","title":"tablite.datatypes.DataTypes.numeric_types = {int, float, date, time, datetime} class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.epoch","title":"tablite.datatypes.DataTypes.epoch = datetime(2000, 1, 1, 0, 0, 0, 0, timezone.utc) class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.epoch_no_tz","title":"tablite.datatypes.DataTypes.epoch_no_tz = datetime(2000, 1, 1, 0, 0, 0, 0) class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.digits","title":"tablite.datatypes.DataTypes.digits = '1234567890' class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.decimals","title":"tablite.datatypes.DataTypes.decimals = set('1234567890-+eE.') class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.integers","title":"tablite.datatypes.DataTypes.integers = set('1234567890-+') class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.nones","title":"tablite.datatypes.DataTypes.nones = {'null', 'Null', 'NULL', '#N/A', '#n/a', '', 'None', None, np.nan} class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.none_type","title":"tablite.datatypes.DataTypes.none_type = type(None) class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.bytes_functions","title":"tablite.datatypes.DataTypes.bytes_functions = {type(None): b_none, bool: b_bool, int: b_int, float: b_float, str: b_str, bytes: b_bytes, datetime: b_datetime, date: b_date, time: b_time, timedelta: b_timedelta} class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.type_code_functions","title":"tablite.datatypes.DataTypes.type_code_functions = {1: _none, 2: 
_bool, 3: _int, 4: _float, 5: _str, 6: _bytes, 7: _datetime, 8: _date, 9: _time, 10: _timedelta, 11: _unpickle} class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.pytype_from_type_code","title":"tablite.datatypes.DataTypes.pytype_from_type_code = {1: type(None), 2: bool, 3: int, 4: float, 5: str, 6: bytes, 7: datetime, 8: date, 9: time, 10: timedelta, 11: 'pickled object'} class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.date_formats","title":"tablite.datatypes.DataTypes.date_formats = {'NNNN-NN-NN': lambda x: date(*int(i) for i in x.split('-')), 'NNNN-N-NN': lambda x: date(*int(i) for i in x.split('-')), 'NNNN-NN-N': lambda x: date(*int(i) for i in x.split('-')), 'NNNN-N-N': lambda x: date(*int(i) for i in x.split('-')), 'NN-NN-NNNN': lambda x: date(*[int(i) for i in x.split('-')][::-1]), 'N-NN-NNNN': lambda x: date(*[int(i) for i in x.split('-')][::-1]), 'NN-N-NNNN': lambda x: date(*[int(i) for i in x.split('-')][::-1]), 'N-N-NNNN': lambda x: date(*[int(i) for i in x.split('-')][::-1]), 'NNNN.NN.NN': lambda x: date(*int(i) for i in x.split('.')), 'NNNN.N.NN': lambda x: date(*int(i) for i in x.split('.')), 'NNNN.NN.N': lambda x: date(*int(i) for i in x.split('.')), 'NNNN.N.N': lambda x: date(*int(i) for i in x.split('.')), 'NN.NN.NNNN': lambda x: date(*[int(i) for i in x.split('.')][::-1]), 'N.NN.NNNN': lambda x: date(*[int(i) for i in x.split('.')][::-1]), 'NN.N.NNNN': lambda x: date(*[int(i) for i in x.split('.')][::-1]), 'N.N.NNNN': lambda x: date(*[int(i) for i in x.split('.')][::-1]), 'NNNN/NN/NN': lambda x: date(*int(i) for i in x.split('/')), 'NNNN/N/NN': lambda x: date(*int(i) for i in x.split('/')), 'NNNN/NN/N': lambda x: date(*int(i) for i in x.split('/')), 'NNNN/N/N': lambda x: date(*int(i) for i in x.split('/')), 'NN/NN/NNNN': lambda x: date(*[int(i) for i in x.split('/')][::-1]), 'N/NN/NNNN': lambda x: date(*[int(i) for i in x.split('/')][::-1]), 'NN/N/NNNN': lambda x: date(*[int(i) for i in x.split('/')][::-1]), 'N/N/NNNN': lambda x: date(*[int(i) for i in x.split('/')][::-1]), 'NNNN NN NN': lambda x: date(*int(i) for i in x.split(' ')), 'NNNN N NN': lambda x: date(*int(i) for i in x.split(' ')), 'NNNN NN N': lambda x: date(*int(i) for i in x.split(' ')), 'NNNN N N': lambda x: date(*int(i) for i in x.split(' ')), 'NN NN NNNN': lambda x: date(*[int(i) for i in x.split(' ')][::-1]), 'N N NNNN': lambda x: date(*[int(i) for i in x.split(' ')][::-1]), 'NN N NNNN': lambda x: date(*[int(i) for i in x.split(' ')][::-1]), 'N NN NNNN': lambda x: date(*[int(i) for i in x.split(' ')][::-1]), 'NNNNNNNN': lambda x: date(*(int(x[:4]), int(x[4:6]), int(x[6:])))} class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.datetime_formats","title":"tablite.datatypes.DataTypes.datetime_formats = {'NNNN-NN-NNTNN:NN:NN': lambda x: DataTypes.pattern_to_datetime(x), 'NNNN-NN-NNTNN:NN': lambda x: DataTypes.pattern_to_datetime(x), 'NNNN-NN-NN NN:NN:NN': lambda x: DataTypes.pattern_to_datetime(x, T=' '), 'NNNN-NN-NN NN:NN': lambda x: DataTypes.pattern_to_datetime(x, T=' '), 'NNNN/NN/NNTNN:NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='/'), 'NNNN/NN/NNTNN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='/'), 'NNNN/NN/NN NN:NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='/', T=' '), 'NNNN/NN/NN NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='/', T=' '), 'NNNN NN NNTNN:NN:NN': lambda x: 
DataTypes.pattern_to_datetime(x, ymd=' '), 'NNNN NN NNTNN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd=' '), 'NNNN NN NN NN:NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd=' ', T=' '), 'NNNN NN NN NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd=' ', T=' '), 'NNNN.NN.NNTNN:NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='.'), 'NNNN.NN.NNTNN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='.'), 'NNNN.NN.NN NN:NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='.', T=' '), 'NNNN.NN.NN NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='.', T=' '), 'NN-NN-NNNNTNN:NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='-', T=' ', day_first=True), 'NN-NN-NNNNTNN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='-', T=' ', day_first=True), 'NN-NN-NNNN NN:NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='-', T=' ', day_first=True), 'NN-NN-NNNN NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='-', T=' ', day_first=True), 'NN/NN/NNNNTNN:NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='/', day_first=True), 'NN/NN/NNNNTNN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='/', day_first=True), 'NN/NN/NNNN NN:NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='/', T=' ', day_first=True), 'NN/NN/NNNN NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='/', T=' ', day_first=True), 'NN NN NNNNTNN:NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='/', day_first=True), 'NN NN NNNNTNN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='/', day_first=True), 'NN NN NNNN NN:NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='/', day_first=True), 'NN NN NNNN NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='/', day_first=True), 'NN.NN.NNNNTNN:NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='.', day_first=True), 'NN.NN.NNNNTNN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='.', day_first=True), 'NN.NN.NNNN NN:NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='.', day_first=True), 'NN.NN.NNNN NN:NN': lambda x: DataTypes.pattern_to_datetime(x, ymd='.', day_first=True), 'NNNNNNNNTNNNNNN': lambda x: DataTypes.pattern_to_datetime(x, compact=1), 'NNNNNNNNTNNNN': lambda x: DataTypes.pattern_to_datetime(x, compact=1), 'NNNNNNNNTNN': lambda x: DataTypes.pattern_to_datetime(x, compact=1), 'NNNNNNNNNN': lambda x: DataTypes.pattern_to_datetime(x, compact=2), 'NNNNNNNNNNNN': lambda x: DataTypes.pattern_to_datetime(x, compact=2), 'NNNNNNNNNNNNNN': lambda x: DataTypes.pattern_to_datetime(x, compact=2), 'NNNNNNNNTNN:NN:NN': lambda x: DataTypes.pattern_to_datetime(x, compact=3)} class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.types","title":"tablite.datatypes.DataTypes.types = [datetime, date, time, int, bool, float, str] class-attribute instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes-functions","title":"Functions","text":""},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.type_code","title":"tablite.datatypes.DataTypes.type_code(value) classmethod ","text":"Source code in tablite/datatypes.py @classmethod\ndef type_code(cls, value):\n if type(value) in cls._type_codes:\n return cls._type_codes[type(value)]\n elif hasattr(value, \"dtype\"):\n dtype = pytype(value)\n return cls._type_codes[dtype]\n else:\n return cls._type_codes[\"pickle\"]\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.b_none","title":"tablite.datatypes.DataTypes.b_none(v) ","text":"Source code in tablite/datatypes.py def 
b_none(v):\n return b\"None\"\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.b_bool","title":"tablite.datatypes.DataTypes.b_bool(v) ","text":"Source code in tablite/datatypes.py def b_bool(v):\n return bytes(str(v), encoding=\"utf-8\")\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.b_int","title":"tablite.datatypes.DataTypes.b_int(v) ","text":"Source code in tablite/datatypes.py def b_int(v):\n return bytes(str(v), encoding=\"utf-8\")\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.b_float","title":"tablite.datatypes.DataTypes.b_float(v) ","text":"Source code in tablite/datatypes.py def b_float(v):\n return bytes(str(v), encoding=\"utf-8\")\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.b_str","title":"tablite.datatypes.DataTypes.b_str(v) ","text":"Source code in tablite/datatypes.py def b_str(v):\n return v.encode(\"utf-8\")\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.b_bytes","title":"tablite.datatypes.DataTypes.b_bytes(v) ","text":"Source code in tablite/datatypes.py def b_bytes(v):\n return v\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.b_datetime","title":"tablite.datatypes.DataTypes.b_datetime(v) ","text":"Source code in tablite/datatypes.py def b_datetime(v):\n return bytes(v.isoformat(), encoding=\"utf-8\")\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.b_date","title":"tablite.datatypes.DataTypes.b_date(v) ","text":"Source code in tablite/datatypes.py def b_date(v):\n return bytes(v.isoformat(), encoding=\"utf-8\")\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.b_time","title":"tablite.datatypes.DataTypes.b_time(v) ","text":"Source code in tablite/datatypes.py def b_time(v):\n return bytes(v.isoformat(), encoding=\"utf-8\")\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.b_timedelta","title":"tablite.datatypes.DataTypes.b_timedelta(v) ","text":"Source code in tablite/datatypes.py def b_timedelta(v):\n return bytes(str(float(v.days + (v.seconds / (24 * 60 * 60)))), \"utf-8\")\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.b_pickle","title":"tablite.datatypes.DataTypes.b_pickle(v) ","text":"Source code in tablite/datatypes.py def b_pickle(v):\n return pickle.dumps(v, protocol=0)\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.to_bytes","title":"tablite.datatypes.DataTypes.to_bytes(v) classmethod ","text":"Source code in tablite/datatypes.py @classmethod\ndef to_bytes(cls, v):\n if type(v) in cls.bytes_functions: # it's a python native type\n f = cls.bytes_functions[type(v)]\n elif hasattr(v, \"dtype\"): # it's a numpy/c type.\n dtype = pytype(v)\n f = cls.bytes_functions[dtype]\n else:\n f = cls.b_pickle\n return f(v)\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.from_type_code","title":"tablite.datatypes.DataTypes.from_type_code(value, code) classmethod ","text":"Source code in tablite/datatypes.py @classmethod\ndef from_type_code(cls, value, code):\n f = cls.type_code_functions[code]\n return f(value)\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.pattern_to_datetime","title":"tablite.datatypes.DataTypes.pattern_to_datetime(iso_string, ymd=None, T=None, compact=0, day_first=False) staticmethod ","text":"Source code in tablite/datatypes.py @staticmethod\ndef pattern_to_datetime(iso_string, ymd=None, T=None, compact=0, day_first=False):\n assert isinstance(iso_string, str)\n if compact:\n s = iso_string\n if compact == 1: # 
has T\n slices = [\n (0, 4, \"-\"),\n (4, 6, \"-\"),\n (6, 8, \"T\"),\n (9, 11, \":\"),\n (11, 13, \":\"),\n (13, len(s), \"\"),\n ]\n elif compact == 2: # has no T.\n slices = [\n (0, 4, \"-\"),\n (4, 6, \"-\"),\n (6, 8, \"T\"),\n (8, 10, \":\"),\n (10, 12, \":\"),\n (12, len(s), \"\"),\n ]\n elif compact == 3: # has T and :\n slices = [\n (0, 4, \"-\"),\n (4, 6, \"-\"),\n (6, 8, \"T\"),\n (9, 11, \":\"),\n (12, 14, \":\"),\n (15, len(s), \"\"),\n ]\n else:\n raise TypeError\n iso_string = \"\".join([s[a:b] + c for a, b, c in slices if b <= len(s)])\n iso_string = iso_string.rstrip(\":\")\n\n if day_first:\n s = iso_string\n iso_string = \"\".join((s[6:10], \"-\", s[3:5], \"-\", s[0:2], s[10:]))\n\n if \",\" in iso_string:\n iso_string = iso_string.replace(\",\", \".\")\n\n dot = iso_string[::-1].find(\".\")\n if 0 < dot < 10:\n ix = len(iso_string) - dot\n microsecond = int(float(f\"0{iso_string[ix - 1:]}\") * 10**6)\n # fmt:off\n iso_string = iso_string[: len(iso_string) - dot] + str(microsecond).rjust(6, \"0\")\n # fmt:on\n if ymd:\n iso_string = iso_string.replace(ymd, \"-\", 2)\n if T:\n iso_string = iso_string.replace(T, \"T\")\n return datetime.fromisoformat(iso_string)\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.round","title":"tablite.datatypes.DataTypes.round(value, multiple, up=None) classmethod ","text":"a nicer way to round numbers. PARAMETER DESCRIPTION value value to be rounded TYPE: (float, integer, datetime) multiple value to be used as the basis of rounding. 1) multiple = 1 is the same as rounding to whole integers. 2) multiple = 0.001 is the same as rounding to 3 digits precision. 3) multiple = 3.1415 is rounding to the nearest multiple of 3.1415 4) value = datetime(2022,8,18,11,14,53,440) 5) multiple = timedelta(hours=0.5) 6) xround(value,multiple) is datetime(2022,8,18,11,0) TYPE: (float, integer, timedelta) up None (default) or boolean rounds half, up or down. round(1.6, 1) rounds to 2. round(1.4, 1) rounds to 1. round(1.5, 1, up=True) rounds to 2. round(1.5, 1, up=False) rounds to 1. TYPE: (None, bool) DEFAULT: None RETURNS DESCRIPTION float,integer,datetime: rounded value in same type as input. 
Source code in tablite/datatypes.py @classmethod\ndef round(cls, value, multiple, up=None):\n \"\"\"a nicer way to round numbers.\n\n Args:\n value (float,integer,datetime): value to be rounded\n\n multiple (float,integer,timedelta): value to be used as the basis of rounding.\n 1) multiple = 1 is the same as rounding to whole integers.\n 2) multiple = 0.001 is the same as rounding to 3 digits precision.\n 3) multiple = 3.1415 is rounding to the nearest multiple of 3.1415\n 4) value = datetime(2022,8,18,11,14,53,440)\n 5) multiple = timedelta(hours=0.5)\n 6) xround(value,multiple) is datetime(2022,8,18,11,0)\n\n up (None, bool, optional):\n None (default) or boolean rounds half, up or down.\n round(1.6, 1) rounds to 2.\n round(1.4, 1) rounds to 1.\n round(1.5, 1, up=True) rounds to 2.\n round(1.5, 1, up=False) rounds to 1.\n\n Returns:\n float,integer,datetime: rounded value in same type as input.\n \"\"\"\n epoch = 0\n if isinstance(value, (datetime)) and isinstance(multiple, timedelta):\n if value.tzinfo is None:\n epoch = cls.epoch_no_tz\n else:\n epoch = cls.epoch\n\n value2 = value - epoch\n if value2 == 0:\n return value2\n\n low = (value2 // multiple) * multiple\n high = low + multiple\n if up is True:\n return high + epoch\n elif up is False:\n return low + epoch\n else:\n if abs((high + epoch) - value) < abs(value - (low + epoch)):\n return high + epoch\n else:\n return low + epoch\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.to_json","title":"tablite.datatypes.DataTypes.to_json(v) staticmethod ","text":"converts any python type to json. PARAMETER DESCRIPTION v value to convert to json TYPE: any RETURNS DESCRIPTION json compatible value from v Source code in tablite/datatypes.py @staticmethod\ndef to_json(v):\n \"\"\"converts any python type to json.\n\n Args:\n v (any): value to convert to json\n\n Returns:\n json compatible value from v\n \"\"\"\n if hasattr(v, \"dtype\"):\n v = numpy_to_python(v)\n if v is None:\n return v\n elif v is False:\n # using isinstance(v, bool): won't work as False also is int of zero.\n return str(v)\n elif v is True:\n return str(v)\n elif isinstance(v, int):\n return v\n elif isinstance(v, str):\n return v\n elif isinstance(v, float):\n return v\n elif isinstance(v, datetime):\n return v.isoformat()\n elif isinstance(v, time):\n return v.isoformat()\n elif isinstance(v, date):\n return v.isoformat()\n elif isinstance(v, timedelta):\n return f\"P{v.days}DT{v.seconds + (v.microseconds / 1e6)}S\"\n else:\n raise TypeError(f\"The datatype {type(v)} is not supported.\")\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.from_json","title":"tablite.datatypes.DataTypes.from_json(v, dtype) staticmethod ","text":"converts json to python datatype PARAMETER DESCRIPTION v value TYPE: any dtype any python type TYPE: python type RETURNS DESCRIPTION python type of value v Source code in tablite/datatypes.py @staticmethod\ndef from_json(v, dtype):\n \"\"\"converts json to python datatype\n\n Args:\n v (any): value\n dtype (python type): any python type\n\n Returns:\n python type of value v\n \"\"\"\n if v in DataTypes.nones:\n if dtype is str and v == \"\":\n return \"\"\n else:\n return None\n if dtype is int:\n return int(v)\n elif dtype is str:\n return str(v)\n elif dtype is float:\n return float(v)\n elif dtype is bool:\n if v == \"False\":\n return False\n elif v == \"True\":\n return True\n else:\n raise ValueError(v)\n elif dtype is date:\n return date.fromisoformat(v)\n elif dtype is datetime:\n return 
datetime.fromisoformat(v)\n elif dtype is time:\n return time.fromisoformat(v)\n elif dtype is timedelta:\n L = v.split(\"DT\")\n days = int(L[0].lstrip(\"P\"))\n seconds = float(L[1].rstrip(\"S\"))\n return timedelta(days, seconds)\n else:\n raise TypeError(f\"The datatype {str(dtype)} is not supported.\")\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.guess_types","title":"tablite.datatypes.DataTypes.guess_types(*values) staticmethod ","text":"Attempts to guess the datatype for *values returns dict with matching datatypes and probabilities RETURNS DESCRIPTION dict {key: type, value: probability} Source code in tablite/datatypes.py @staticmethod\ndef guess_types(*values):\n \"\"\"Attempts to guess the datatype for *values\n returns dict with matching datatypes and probabilities\n\n Returns:\n dict: {key: type, value: probability}\n \"\"\"\n d = defaultdict(int)\n probability = Rank(DataTypes.types[:])\n\n for value in values:\n if hasattr(value, \"dtype\"):\n value = numpy_to_python(value)\n\n for dtype in probability:\n try:\n _ = DataTypes.infer(value, dtype)\n d[dtype] += 1\n probability.match(dtype)\n break\n except (ValueError, TypeError):\n pass\n if not d:\n d[str] = len(values)\n return {k: round(v / len(values), 3) for k, v in d.items()}\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.guess","title":"tablite.datatypes.DataTypes.guess(*values) staticmethod ","text":"Makes a best guess of the datatype for *values returns list of native python values RETURNS DESCRIPTION list list of native python values Source code in tablite/datatypes.py @staticmethod\ndef guess(*values):\n \"\"\"Makes a best guess of the datatype for *values\n returns list of native python values\n\n Returns:\n list: list of native python values\n \"\"\"\n probability = Rank(*DataTypes.types[:])\n matches = [None for _ in values[0]]\n\n for ix, value in enumerate(values[0]):\n if hasattr(value, \"dtype\"):\n value = numpy_to_python(value)\n for dtype in probability:\n try:\n matches[ix] = DataTypes.infer(value, dtype)\n probability.match(dtype)\n break\n except (ValueError, TypeError):\n pass\n return matches\n "},{"location":"reference/datatypes/#tablite.datatypes.DataTypes.infer","title":"tablite.datatypes.DataTypes.infer(v, dtype) classmethod ","text":"Source code in tablite/datatypes.py @classmethod\ndef infer(cls, v, dtype):\n if isinstance(v, str) and dtype == str:\n # we got a string, we're trying to infer it to string, we shouldn't check for None-ness\n return v\n\n if v in DataTypes.nones:\n return None\n\n if dtype not in matched_types:\n raise TypeError(f\"The datatype {str(dtype)} is not supported.\")\n\n return matched_types[dtype](v)\n "},{"location":"reference/datatypes/#tablite.datatypes.Rank","title":"tablite.datatypes.Rank(*items) ","text":" Bases: object Source code in tablite/datatypes.py def __init__(self, *items):\n self.items = {i: ix for i, ix in zip(items, range(len(items)))}\n self.ranks = [0 for _ in items]\n self.items_list = [i for i in items]\n "},{"location":"reference/datatypes/#tablite.datatypes.Rank-attributes","title":"Attributes","text":""},{"location":"reference/datatypes/#tablite.datatypes.Rank.items","title":"tablite.datatypes.Rank.items = {i: ix for (i, ix) in zip(items, range(len(items)))} instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.Rank.ranks","title":"tablite.datatypes.Rank.ranks = [0 for _ in items] instance-attribute 
","text":""},{"location":"reference/datatypes/#tablite.datatypes.Rank.items_list","title":"tablite.datatypes.Rank.items_list = [i for i in items] instance-attribute ","text":""},{"location":"reference/datatypes/#tablite.datatypes.Rank-functions","title":"Functions","text":""},{"location":"reference/datatypes/#tablite.datatypes.Rank.match","title":"tablite.datatypes.Rank.match(k) ","text":"Source code in tablite/datatypes.py def match(self, k): # k+=1\n ix = self.items[k]\n r = self.ranks\n r[ix] += 1\n\n if ix > 0:\n p = self.items_list\n while (\n r[ix] > r[ix - 1] and ix > 0\n ): # use a simple bubble sort to maintain rank\n r[ix], r[ix - 1] = r[ix - 1], r[ix]\n p[ix], p[ix - 1] = p[ix - 1], p[ix]\n old = p[ix]\n self.items[old] = ix\n self.items[k] = ix - 1\n ix -= 1\n "},{"location":"reference/datatypes/#tablite.datatypes.Rank.__iter__","title":"tablite.datatypes.Rank.__iter__() ","text":"Source code in tablite/datatypes.py def __iter__(self):\n return iter(self.items_list)\n "},{"location":"reference/datatypes/#tablite.datatypes.MetaArray","title":"tablite.datatypes.MetaArray ","text":" Bases: ndarray Array with metadata. "},{"location":"reference/datatypes/#tablite.datatypes.MetaArray-functions","title":"Functions","text":""},{"location":"reference/datatypes/#tablite.datatypes.MetaArray.__new__","title":"tablite.datatypes.MetaArray.__new__(array, dtype=None, order=None, **kwargs) ","text":"Source code in tablite/datatypes.py def __new__(cls, array, dtype=None, order=None, **kwargs):\n obj = np.asarray(array, dtype=dtype, order=order).view(cls)\n obj.metadata = kwargs\n return obj\n "},{"location":"reference/datatypes/#tablite.datatypes.MetaArray.__array_finalize__","title":"tablite.datatypes.MetaArray.__array_finalize__(obj) ","text":"Source code in tablite/datatypes.py def __array_finalize__(self, obj):\n if obj is None:\n return\n self.metadata = getattr(obj, \"metadata\", None)\n "},{"location":"reference/datatypes/#tablite.datatypes-functions","title":"Functions","text":""},{"location":"reference/datatypes/#tablite.datatypes.numpy_to_python","title":"tablite.datatypes.numpy_to_python(obj: Any) -> Any ","text":"Converts numpy types to python types. See https://numpy.org/doc/stable/reference/arrays.scalars.html PARAMETER DESCRIPTION obj A numpy object TYPE: Any RETURNS DESCRIPTION Any python object: A python object Source code in tablite/datatypes.py def numpy_to_python(obj: Any) -> Any:\n \"\"\"Converts numpy types to python types.\n\n See https://numpy.org/doc/stable/reference/arrays.scalars.html\n\n Args:\n obj (Any): A numpy object\n\n Returns:\n python object: A python object\n \"\"\"\n if isinstance(obj, np.generic):\n return obj.item()\n return obj\n "},{"location":"reference/datatypes/#tablite.datatypes.pytype","title":"tablite.datatypes.pytype(obj) ","text":"Returns the python type of any object PARAMETER DESCRIPTION obj any numpy or python object TYPE: Any RETURNS DESCRIPTION type type of obj Source code in tablite/datatypes.py def pytype(obj):\n \"\"\"Returns the python type of any object\n\n Args:\n obj (Any): any numpy or python object\n\n Returns:\n type: type of obj\n \"\"\"\n if isinstance(obj, np.generic):\n return type(obj.item())\n return type(obj)\n "},{"location":"reference/datatypes/#tablite.datatypes.pytype_from_iterable","title":"tablite.datatypes.pytype_from_iterable(iterable: {tuple, list}) -> {np.dtype, dict} ","text":"helper to make correct np array from python types. PARAMETER DESCRIPTION iterable values to be converted to numpy array. 
TYPE: (tuple, list) RAISES DESCRIPTION NotImplementedError if datatype is not supported. RETURNS DESCRIPTION {dtype, dict} np.dtype: python type of the iterable. Source code in tablite/datatypes.py def pytype_from_iterable(iterable: {tuple, list}) -> {np.dtype, dict}:\n \"\"\"helper to make correct np array from python types.\n\n Args:\n iterable (tuple,list): values to be converted to numpy array.\n\n Raises:\n NotImplementedError: if datatype is not supported.\n\n Returns:\n np.dtype: python type of the iterable.\n \"\"\"\n py_types = {}\n if isinstance(iterable, (tuple, list)):\n type_counter = Counter((pytype(v) for v in iterable))\n\n for k, v in type_counter.items():\n py_types[k] = v\n\n if len(py_types) == 0:\n np_dtype, py_dtype = object, bool\n elif len(py_types) == 1:\n py_dtype = list(py_types.keys())[0]\n if py_dtype == datetime:\n np_dtype = np.datetime64\n elif py_dtype == date:\n np_dtype = np.datetime64\n elif py_dtype == timedelta:\n np_dtype = np.timedelta64\n else:\n np_dtype = None\n else:\n np_dtype = object\n elif isinstance(iterable, np.ndarray):\n if iterable.dtype == object:\n np_dtype = object\n py_types = dict(Counter((pytype(v) for v in iterable)))\n else:\n np_dtype = iterable.dtype\n if len(iterable) > 0:\n py_types = {pytype(iterable[0]): len(iterable)}\n else:\n py_types = {pytype(np_dtype.type()): len(iterable)}\n else:\n raise NotImplementedError(f\"No handler for {type(iterable)}\")\n\n return np_dtype, py_types\n "},{"location":"reference/datatypes/#tablite.datatypes.list_to_np_array","title":"tablite.datatypes.list_to_np_array(iterable) ","text":"helper to make correct np array from python types. Example of problem where numpy turns mixed types into strings. np.array([4, '5']) np.ndarray(['4', '5']) RETURNS DESCRIPTION np.array datatypes Source code in tablite/datatypes.py def list_to_np_array(iterable):\n \"\"\"helper to make correct np array from python types.\n Example of problem where numpy turns mixed types into strings.\n >>> np.array([4, '5'])\n np.ndarray(['4', '5'])\n\n returns:\n np.array\n datatypes\n \"\"\"\n np_dtype, py_dtype = pytype_from_iterable(iterable)\n\n value = MetaArray(iterable, dtype=np_dtype, py_dtype=py_dtype)\n return value\n "},{"location":"reference/datatypes/#tablite.datatypes.np_type_unify","title":"tablite.datatypes.np_type_unify(arrays) ","text":"unifies numpy types. PARAMETER DESCRIPTION arrays List of numpy arrays TYPE: list RETURNS DESCRIPTION np.ndarray: numpy array of a single type. Source code in tablite/datatypes.py def np_type_unify(arrays):\n \"\"\"unifies numpy types.\n\n Args:\n arrays (list): List of numpy arrays\n\n Returns:\n np.ndarray: numpy array of a single type.\n \"\"\"\n dtypes = {arr.dtype: len(arr) for arr in arrays}\n if len(dtypes) == 1:\n dtype, _ = dtypes.popitem()\n else:\n for ix, arr in enumerate(arrays):\n arrays[ix] = np.array(arr, dtype=object)\n dtype = object\n return np.concatenate(arrays, dtype=dtype)\n "},{"location":"reference/datatypes/#tablite.datatypes.multitype_set","title":"tablite.datatypes.multitype_set(arr) ","text":"prevents loss of True, False when calling sets. python loses values when returning a set. Example: {1, True, 0, False} PARAMETER DESCRIPTION arr iterable of mixed types. TYPE: Iterable RETURNS DESCRIPTION np.array: with unique values. Source code in tablite/datatypes.py def multitype_set(arr):\n \"\"\"prevents loss of True, False when calling sets.\n\n python loses values when returning a set. 
Example:\n >>> {1, True, 0, False}\n {0,1}\n\n Args:\n arr (Iterable): iterable of mixed types.\n\n Returns:\n np.array: with unique values.\n \"\"\"\n L = [(type(v), v) for v in arr]\n L = list(set(L))\n L = [v for _, v in L]\n return np.array(L, dtype=object)\n "},{"location":"reference/diff/","title":"Diff","text":""},{"location":"reference/diff/#tablite.diff","title":"tablite.diff ","text":""},{"location":"reference/diff/#tablite.diff-classes","title":"Classes","text":""},{"location":"reference/diff/#tablite.diff-functions","title":"Functions","text":""},{"location":"reference/diff/#tablite.diff.diff","title":"tablite.diff.diff(T, other, columns=None) ","text":"compares table self with table other PARAMETER DESCRIPTION self Table TYPE: Table other Table TYPE: Table columns list of column names to include in comparison. Defaults to None. TYPE: List DEFAULT: None RETURNS DESCRIPTION Table diff of self and other with diff in columns 1st and 2nd. Source code in tablite/diff.py def diff(T, other, columns=None):\n \"\"\"compares table self with table other\n\n Args:\n self (Table): Table\n other (Table): Table\n columns (List, optional): list of column names to include in comparison. Defaults to None.\n\n Returns:\n Table: diff of self and other with diff in columns 1st and 2nd.\n \"\"\"\n sub_cls_check(T, BaseTable)\n sub_cls_check(other, BaseTable)\n if columns is None:\n columns = [name for name in T.columns if name in other.columns]\n elif isinstance(columns, list) and all(isinstance(i, str) for i in columns):\n for name in columns:\n if name not in T.columns:\n raise ValueError(f\"column '{name}' not found\")\n if name not in other.columns:\n raise ValueError(f\"column '{name}' not found\")\n else:\n raise TypeError(\"Expected list of column names\")\n\n t1 = T[columns]\n if issubclass(type(t1), BaseTable):\n t1 = [tuple(r) for r in T.rows]\n else:\n t1 = list(T)\n t2 = other[columns]\n if issubclass(type(t2), BaseTable):\n t2 = [tuple(r) for r in other.rows]\n else:\n t2 = list(other)\n\n sm = difflib.SequenceMatcher(None, t1, t2)\n new = type(T)()\n first = unique_name(\"1st\", columns)\n second = unique_name(\"2nd\", columns)\n new.add_columns(*columns + [first, second])\n\n news = {n: [] for n in new.columns} # Cache for Work in progress.\n\n for opc, t1a, t1b, t2a, t2b in sm.get_opcodes():\n if opc == \"insert\":\n for name, col in zip(columns, zip(*t2[t2a:t2b])):\n news[name].extend(col)\n news[first] += [\"-\"] * (t2b - t2a)\n news[second] += [\"+\"] * (t2b - t2a)\n\n elif opc == \"delete\":\n for name, col in zip(columns, zip(*t1[t1a:t1b])):\n news[name].extend(col)\n news[first] += [\"+\"] * (t1b - t1a)\n news[second] += [\"-\"] * (t1b - t1a)\n\n elif opc == \"equal\":\n for name, col in zip(columns, zip(*t2[t2a:t2b])):\n news[name].extend(col)\n news[first] += [\"=\"] * (t2b - t2a)\n news[second] += [\"=\"] * (t2b - t2a)\n\n elif opc == \"replace\":\n for name, col in zip(columns, zip(*t2[t2a:t2b])):\n news[name].extend(col)\n news[first] += [\"r\"] * (t2b - t2a)\n news[second] += [\"r\"] * (t2b - t2a)\n\n else:\n pass\n\n # Clear cache to free up memory.\n if len(news[first]) > Config.PAGE_SIZE:\n for name, L in news.items():\n new[name].extend(np.array(L))\n L.clear()\n\n for name, L in news.items():\n new[name].extend(np.array(L))\n L.clear()\n return new\n "},{"location":"reference/export_utils/","title":"Export utils","text":""},{"location":"reference/export_utils/#tablite.export_utils","title":"tablite.export_utils 
","text":""},{"location":"reference/export_utils/#tablite.export_utils-classes","title":"Classes","text":""},{"location":"reference/export_utils/#tablite.export_utils-functions","title":"Functions","text":""},{"location":"reference/export_utils/#tablite.export_utils.to_sql","title":"tablite.export_utils.to_sql(table, name) ","text":"generates ANSI-92 compliant SQL. PARAMETER DESCRIPTION name name of SQL table. TYPE: str Source code in tablite/export_utils.py def to_sql(table, name):\n \"\"\"\n generates ANSI-92 compliant SQL.\n\n args:\n name (str): name of SQL table.\n \"\"\"\n sub_cls_check(table, BaseTable)\n type_check(name, str)\n\n prefix = name\n name = \"T1\"\n create_table = \"\"\"CREATE TABLE {} ({})\"\"\"\n columns = []\n for name, col in table.columns.items():\n dtype = col.types()\n if len(dtype) == 1:\n dtype, _ = dtype.popitem()\n if dtype is int:\n dtype = \"INTEGER\"\n elif dtype is float:\n dtype = \"REAL\"\n else:\n dtype = \"TEXT\"\n else:\n dtype = \"TEXT\"\n definition = f\"{name} {dtype}\"\n columns.append(definition)\n\n create_table = create_table.format(prefix, \", \".join(columns))\n\n # return create_table\n row_inserts = []\n for row in table.rows:\n row_inserts.append(str(tuple([i if i is not None else \"NULL\" for i in row])))\n row_inserts = f\"INSERT INTO {prefix} VALUES \" + \",\".join(row_inserts)\n return \"begin; {}; {}; commit;\".format(create_table, row_inserts)\n "},{"location":"reference/export_utils/#tablite.export_utils.to_pandas","title":"tablite.export_utils.to_pandas(table) ","text":"returns pandas.DataFrame Source code in tablite/export_utils.py def to_pandas(table):\n \"\"\"\n returns pandas.DataFrame\n \"\"\"\n sub_cls_check(table, BaseTable)\n try:\n return pd.DataFrame(table.to_dict()) # noqa\n except ImportError:\n import pandas as pd # noqa\n return pd.DataFrame(table.to_dict()) # noqa\n "},{"location":"reference/export_utils/#tablite.export_utils.to_hdf5","title":"tablite.export_utils.to_hdf5(table, path) ","text":"creates a copy of the table as hdf5 Note that some loss of type information is to be expected in columns of mixed type: t.show(dtype=True) +===+===+=====+=====+====+=====+=====+===================+==========+========+===============+===+=========================+=====+===+ | # | A | B | C | D | E | F | G | H | I | J | K | L | M | O | |row|int|mixed|float|str |mixed| bool| datetime | date | time | timedelta |str| int |float|int| +---+---+-----+-----+----+-----+-----+-------------------+----------+--------+---------------+---+-------------------------+-----+---+ | 0 | -1|None | -1.1| |None |False|2023-06-09 09:12:06|2023-06-09|09:12:06| 1 day, 0:00:00|b |-100000000000000000000000| inf| 11| | 1 | 1| 1| 1.1|1000|1 | True|2023-06-09 09:12:06|2023-06-09|09:12:06|2 days, 0:06:40|\u55e8 | 100000000000000000000000| -inf|-11| +===+===+=====+=====+====+=====+=====+===================+==========+========+===============+===+=========================+=====+===+ t.to_hdf5(filename) t2 = Table.from_hdf5(filename) t2.show(dtype=True) +===+===+=====+=====+=====+=====+=====+===================+===================+========+===============+===+=========================+=====+===+ | # | A | B | C | D | E | F | G | H | I | J | K | L | M | O | |row|int|mixed|float|mixed|mixed| bool| datetime | datetime | time | str |str| int |float|int| +---+---+-----+-----+-----+-----+-----+-------------------+-------------------+--------+---------------+---+-------------------------+-----+---+ | 0 | -1|None | -1.1|None |None |False|2023-06-09 09:12:06|2023-06-09 
00:00:00|09:12:06|1 day, 0:00:00 |b |-100000000000000000000000| inf| 11| | 1 | 1| 1| 1.1| 1000| 1| True|2023-06-09 09:12:06|2023-06-09 00:00:00|09:12:06|2 days, 0:06:40|\u55e8 | 100000000000000000000000| -inf|-11| +===+===+=====+=====+=====+=====+=====+===================+===================+========+===============+===+=========================+=====+===+ Source code in tablite/export_utils.py def to_hdf5(table, path):\n # fmt: off\n \"\"\"\n creates a copy of the table as hdf5\n\n Note that some loss of type information is to be expected in columns of mixed type:\n >>> t.show(dtype=True)\n +===+===+=====+=====+====+=====+=====+===================+==========+========+===============+===+=========================+=====+===+\n | # | A | B | C | D | E | F | G | H | I | J | K | L | M | O |\n |row|int|mixed|float|str |mixed| bool| datetime | date | time | timedelta |str| int |float|int|\n +---+---+-----+-----+----+-----+-----+-------------------+----------+--------+---------------+---+-------------------------+-----+---+\n | 0 | -1|None | -1.1| |None |False|2023-06-09 09:12:06|2023-06-09|09:12:06| 1 day, 0:00:00|b |-100000000000000000000000| inf| 11|\n | 1 | 1| 1| 1.1|1000|1 | True|2023-06-09 09:12:06|2023-06-09|09:12:06|2 days, 0:06:40|\u55e8 | 100000000000000000000000| -inf|-11|\n +===+===+=====+=====+====+=====+=====+===================+==========+========+===============+===+=========================+=====+===+\n >>> t.to_hdf5(filename)\n >>> t2 = Table.from_hdf5(filename)\n >>> t2.show(dtype=True)\n +===+===+=====+=====+=====+=====+=====+===================+===================+========+===============+===+=========================+=====+===+\n | # | A | B | C | D | E | F | G | H | I | J | K | L | M | O |\n |row|int|mixed|float|mixed|mixed| bool| datetime | datetime | time | str |str| int |float|int|\n +---+---+-----+-----+-----+-----+-----+-------------------+-------------------+--------+---------------+---+-------------------------+-----+---+\n | 0 | -1|None | -1.1|None |None |False|2023-06-09 09:12:06|2023-06-09 00:00:00|09:12:06|1 day, 0:00:00 |b |-100000000000000000000000| inf| 11|\n | 1 | 1| 1| 1.1| 1000| 1| True|2023-06-09 09:12:06|2023-06-09 00:00:00|09:12:06|2 days, 0:06:40|\u55e8 | 100000000000000000000000| -inf|-11|\n +===+===+=====+=====+=====+=====+=====+===================+===================+========+===============+===+=========================+=====+===+\n \"\"\"\n # fmt: in\n import h5py\n\n sub_cls_check(table, BaseTable)\n type_check(path, Path)\n\n total = f\"{len(table.columns) * len(table):,}\" # noqa\n print(f\"writing {total} records to {path}\", end=\"\")\n\n with h5py.File(path, \"w\") as f:\n n = 0\n for name, col in table.items():\n try:\n f.create_dataset(name, data=col[:]) # stored in hdf5 as '/name'\n except TypeError:\n f.create_dataset(name, data=[str(i) for i in col[:]]) # stored in hdf5 as '/name'\n n += 1\n print(\"... done\")\n "},{"location":"reference/export_utils/#tablite.export_utils.excel_writer","title":"tablite.export_utils.excel_writer(table, path) ","text":"writer for excel files. This can create xlsx files beyond Excels. If you're using pyexcel to read the data, you'll see the data is there. If you're using Excel, Excel will stop loading after 1,048,576 rows. 
See pyexcel for more details: http://docs.pyexcel.org/ Source code in tablite/export_utils.py def excel_writer(table, path):\n \"\"\"\n writer for excel files.\n\n This can create xlsx files beyond Excels.\n If you're using pyexcel to read the data, you'll see the data is there.\n If you're using Excel, Excel will stop loading after 1,048,576 rows.\n\n See pyexcel for more details:\n http://docs.pyexcel.org/\n \"\"\"\n import pyexcel\n\n sub_cls_check(table, BaseTable)\n type_check(path, Path)\n\n def gen(table): # local helper\n yield table.columns\n for row in table.rows:\n yield row\n\n data = list(gen(table))\n if path.suffix in [\".xls\", \".ods\"]:\n data = [\n [str(v) if (isinstance(v, (int, float)) and abs(v) > 2**32 - 1) else DataTypes.to_json(v) for v in row]\n for row in data\n ]\n\n pyexcel.save_as(array=data, dest_file_name=str(path))\n "},{"location":"reference/export_utils/#tablite.export_utils.to_json","title":"tablite.export_utils.to_json(table, *args, **kwargs) ","text":"Source code in tablite/export_utils.py def to_json(table, *args, **kwargs):\n import json\n\n sub_cls_check(table, BaseTable)\n return json.dumps(table.as_json_serializable())\n "},{"location":"reference/export_utils/#tablite.export_utils.path_suffix_check","title":"tablite.export_utils.path_suffix_check(path, kind) ","text":"Source code in tablite/export_utils.py def path_suffix_check(path, kind):\n if not path.suffix == kind:\n raise ValueError(f\"Suffix mismatch: Expected {kind}, got {path.suffix} in {path.name}\")\n if not path.parent.exists():\n raise FileNotFoundError(f\"directory {path.parent} not found.\")\n "},{"location":"reference/export_utils/#tablite.export_utils.text_writer","title":"tablite.export_utils.text_writer(table, path, tqdm=_tqdm) ","text":"exports table to csv, tsv or txt depending on path suffix. follows the JSON norm. text escape is ON for all strings. "},{"location":"reference/export_utils/#tablite.export_utils.text_writer--note","title":"Note:","text":"If the delimiter is present in a string when the string is exported, text-escape is required, as the format otherwise is corrupted. When the file is being written, it is unknown whether any string in a column contains the delimiter. As text escaping the few strings that may contain the delimiter would lead to an asymmetric format, the safer guess is to text escape all strings. Source code in tablite/export_utils.py def text_writer(table, path, tqdm=_tqdm):\n \"\"\"exports table to csv, tsv or txt depending on path suffix.\n follows the JSON norm. text escape is ON for all strings.\n\n Note:\n ----------------------\n If the delimiter is present in a string when the string is exported,\n text-escape is required, as the format otherwise is corrupted.\n When the file is being written, it is unknown whether any string in\n a column contains the delimiter. 
As text escaping the few strings\n that may contain the delimiter would lead to an asymmetric format,\n the safer guess is to text escape all strings.\n \"\"\"\n sub_cls_check(table, BaseTable)\n type_check(path, Path)\n\n def txt(value): # helper for text writer\n if value is None:\n return \"\" # A column with 1,None,2 must be \"1,,2\".\n elif isinstance(value, str):\n # if not (value.startswith('\"') and value.endswith('\"')):\n # return f'\"{value}\"' # this must be escape: \"the quick fox, jumped over the comma\"\n # else:\n return value # this would for example be an empty string: \"\"\n else:\n return str(DataTypes.to_json(value)) # this handles datetimes, timedelta, etc.\n\n delimiters = {\".csv\": \",\", \".tsv\": \"\\t\", \".txt\": \"|\"}\n delimiter = delimiters.get(path.suffix)\n\n with path.open(\"w\", encoding=\"utf-8\") as fo:\n fo.write(delimiter.join(c for c in table.columns) + \"\\n\")\n for row in tqdm(table.rows, total=len(table), disable=Config.TQDM_DISABLE):\n fo.write(delimiter.join(txt(c) for c in row) + \"\\n\")\n "},{"location":"reference/export_utils/#tablite.export_utils.sql_writer","title":"tablite.export_utils.sql_writer(table, path) ","text":"Source code in tablite/export_utils.py def sql_writer(table, path):\n type_check(table, BaseTable)\n type_check(path, Path)\n with path.open(\"w\", encoding=\"utf-8\") as fo:\n fo.write(to_sql(table))\n "},{"location":"reference/export_utils/#tablite.export_utils.json_writer","title":"tablite.export_utils.json_writer(table, path) ","text":"Source code in tablite/export_utils.py def json_writer(table, path):\n type_check(table, BaseTable)\n type_check(path, Path)\n with path.open(\"w\") as fo:\n fo.write(to_json(table))\n "},{"location":"reference/export_utils/#tablite.export_utils.to_html","title":"tablite.export_utils.to_html(table, path) ","text":"Source code in tablite/export_utils.py def to_html(table, path):\n type_check(table, BaseTable)\n type_check(path, Path)\n with path.open(\"w\", encoding=\"utf-8\") as fo:\n fo.write(table._repr_html_(slice(0, len(table))))\n "},{"location":"reference/file_reader_utils/","title":"File reader utils","text":""},{"location":"reference/file_reader_utils/#tablite.file_reader_utils","title":"tablite.file_reader_utils ","text":""},{"location":"reference/file_reader_utils/#tablite.file_reader_utils-attributes","title":"Attributes","text":""},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.ENCODING_GUESS_BYTES","title":"tablite.file_reader_utils.ENCODING_GUESS_BYTES = 10000 module-attribute ","text":""},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.header_readers","title":"tablite.file_reader_utils.header_readers = {'fods': excel_reader_headers, 'json': excel_reader_headers, 'simple': excel_reader_headers, 'rst': excel_reader_headers, 'mediawiki': excel_reader_headers, 'xlsx': excel_reader_headers, 'xlsm': excel_reader_headers, 'csv': text_reader_headers, 'tsv': text_reader_headers, 'txt': text_reader_headers, 'ods': ods_reader_headers} module-attribute ","text":""},{"location":"reference/file_reader_utils/#tablite.file_reader_utils-classes","title":"Classes","text":""},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.TextEscape","title":"tablite.file_reader_utils.TextEscape(openings='({[', closures=']})', text_qualifier='\"', delimiter=',', strip_leading_and_tailing_whitespace=False) ","text":" Bases: object enables parsing of CSV while respecting brackets and text marks. 
Example: text_escape = TextEscape() # set up the instance. for line in somefile.readlines(): list_of_words = text_escape(line) # use the instance. ... As an example, the Danes and Germans use \" for inches and ' for feet, so we will see data that contains nail (75 x 4 mm, 3\" x 3/12\"), so for this case ( and ) are valid escapes, but \" and ' aren't. Source code in tablite/file_reader_utils.py def __init__(\n self,\n openings=\"({[\",\n closures=\"]})\",\n text_qualifier='\"',\n delimiter=\",\",\n strip_leading_and_tailing_whitespace=False,\n):\n \"\"\"\n As an example, the Danes and Germans use \" for inches and ' for feet,\n so we will see data that contains nail (75 x 4 mm, 3\" x 3/12\"), so\n for this case ( and ) are valid escapes, but \" and ' aren't.\n\n \"\"\"\n if openings is None:\n openings = [None]\n elif isinstance(openings, str):\n self.openings = {c for c in openings}\n else:\n raise TypeError(f\"expected str, got {type(openings)}\")\n\n if closures is None:\n closures = [None]\n elif isinstance(closures, str):\n self.closures = {c for c in closures}\n else:\n raise TypeError(f\"expected str, got {type(closures)}\")\n\n if not isinstance(delimiter, str):\n raise TypeError(f\"expected str, got {type(delimiter)}\")\n self.delimiter = delimiter\n self._delimiter_length = len(delimiter)\n self.strip_leading_and_tailing_whitespace = strip_leading_and_tailing_whitespace\n\n if text_qualifier is None:\n pass\n elif text_qualifier in openings + closures:\n raise ValueError(\"It's a bad idea to have qoute character appears in openings or closures.\")\n else:\n self.qoute = text_qualifier\n\n if not text_qualifier:\n if not self.strip_leading_and_tailing_whitespace:\n self.c = self._call_1\n else:\n self.c = self._call_2\n else:\n self.c = self._call_3\n "},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.TextEscape-attributes","title":"Attributes","text":""},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.TextEscape.openings","title":"tablite.file_reader_utils.TextEscape.openings = {c for c in openings} instance-attribute ","text":""},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.TextEscape.closures","title":"tablite.file_reader_utils.TextEscape.closures = {c for c in closures} instance-attribute ","text":""},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.TextEscape.delimiter","title":"tablite.file_reader_utils.TextEscape.delimiter = delimiter instance-attribute ","text":""},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.TextEscape.strip_leading_and_tailing_whitespace","title":"tablite.file_reader_utils.TextEscape.strip_leading_and_tailing_whitespace = strip_leading_and_tailing_whitespace instance-attribute ","text":""},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.TextEscape.qoute","title":"tablite.file_reader_utils.TextEscape.qoute = text_qualifier instance-attribute ","text":""},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.TextEscape.c","title":"tablite.file_reader_utils.TextEscape.c = self._call_1 instance-attribute ","text":""},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.TextEscape-functions","title":"Functions","text":""},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.TextEscape.__call__","title":"tablite.file_reader_utils.TextEscape.__call__(s) ","text":"Source code in tablite/file_reader_utils.py def __call__(self, s):\n return self.c(s)\n 
"},{"location":"reference/file_reader_utils/#tablite.file_reader_utils-functions","title":"Functions","text":""},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.split_by_sequence","title":"tablite.file_reader_utils.split_by_sequence(text, sequence) ","text":"helper to split text according to a split sequence. Source code in tablite/file_reader_utils.py def split_by_sequence(text, sequence):\n \"\"\"helper to split text according to a split sequence.\"\"\"\n chunks = tuple()\n for element in sequence:\n idx = text.find(element)\n if idx < 0:\n raise ValueError(f\"'{element}' not in row\")\n chunk, text = text[:idx], text[len(element) + idx :]\n chunks += (chunk,)\n chunks += (text,) # the remaining text.\n return chunks\n "},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.detect_seperator","title":"tablite.file_reader_utils.detect_seperator(text) ","text":":param path: pathlib.Path objects :param encoding: file encoding. :return: 1 character. Source code in tablite/file_reader_utils.py def detect_seperator(text):\n \"\"\"\n :param path: pathlib.Path objects\n :param encoding: file encoding.\n :return: 1 character.\n \"\"\"\n # After reviewing the logic in the CSV sniffer, I concluded that all it\n # really does is to look for a non-text character. As the separator is\n # determined by the first line, which almost always is a line of headers,\n # the text characters will be utf-8,16 or ascii letters plus white space.\n # This leaves the characters ,;:| and \\t as potential separators, with one\n # exception: files that use whitespace as separator. My logic is therefore\n # to (1) find the set of characters that intersect with ',;:|\\t' which in\n # practice is a single character, unless (2) it is empty whereby it must\n # be whitespace.\n if len(text) == 0:\n return None\n seps = {\",\", \"\\t\", \";\", \":\", \"|\"}.intersection(text)\n if not seps:\n if \" \" in text:\n return \" \"\n if \"\\n\" in text:\n return \"\\n\"\n else:\n raise ValueError(\"separator not detected\")\n if len(seps) == 1:\n return seps.pop()\n else:\n frq = [(text.count(i), i) for i in seps]\n frq.sort(reverse=True) # most frequent first.\n return frq[0][-1]\n "},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.text_reader_headers","title":"tablite.file_reader_utils.text_reader_headers(path, delimiter, header_row_index, text_qualifier, linecount) ","text":"Source code in tablite/file_reader_utils.py def text_reader_headers(path, delimiter, header_row_index, text_qualifier, linecount):\n d = {}\n delimiters = {\n \".csv\": \",\",\n \".tsv\": \"\\t\",\n \".txt\": None,\n }\n\n try:\n with path.open(\"rb\") as fi:\n rawdata = fi.read(ENCODING_GUESS_BYTES)\n encoding = chardet.detect(rawdata)[\"encoding\"]\n\n if delimiter is None:\n with path.open(\"r\", encoding=encoding, errors=\"ignore\") as fi:\n lines = []\n for n, line in enumerate(fi, -header_row_index):\n if n < 0:\n continue\n line = line.rstrip(\"\\n\")\n lines.append(line)\n if n >= linecount:\n break # break on first\n try:\n d[\"delimiter\"] = delimiter = detect_seperator(\"\\n\".join(lines))\n except ValueError as e:\n if e.args == (\"separator not detected\", ):\n d[\"delimiter\"] = delimiter = None # this will handle the case of 1 column, 1 row\n else:\n raise e\n\n if delimiter is None:\n d[\"delimiter\"] = delimiter = delimiters[path.suffix] # pickup the default one\n d[path.name] = [lines]\n d[\"is_empty\"] = True # mark as empty to return an empty table instead of throwing\n else:\n kwargs = 
{}\n\n if text_qualifier is not None:\n kwargs[\"text_qualifier\"] = text_qualifier\n kwargs[\"quoting\"] = \"QUOTE_MINIMAL\"\n else:\n kwargs[\"quoting\"] = \"QUOTE_NONE\"\n\n d[path.name] = _get_headers(\n str(path), py_to_nim_encoding(encoding), header_row_index=header_row_index,\n delimiter=delimiter,\n linecount=linecount,\n **kwargs\n )\n return d\n except Exception as e:\n raise ValueError(f\"can't read {path.suffix}\")\n "},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.excel_reader_headers","title":"tablite.file_reader_utils.excel_reader_headers(path, delimiter, header_row_index, text_qualifier, linecount) ","text":"Source code in tablite/file_reader_utils.py def excel_reader_headers(path, delimiter, header_row_index, text_qualifier, linecount):\n d = {}\n book = openpyxl.open(str(path), read_only=True)\n\n try:\n all_sheets = book.sheetnames\n\n for sheet_name, sheet in ((name, book[name]) for name in all_sheets):\n fixup_worksheet(sheet)\n if sheet.max_row is None:\n max_rows = 0\n else:\n max_rows = min(sheet.max_row, linecount + 1)\n container = [None] * max_rows\n padding_ends = 0\n max_column = sheet.max_column\n\n for i, row_data in enumerate(sheet.iter_rows(0, header_row_index + max_rows, values_only=True), start=-header_row_index):\n if i < 0:\n # NOTE: for some reason `iter_rows` specifying a start row starts reading cells as binary, instead skip the rows that are before our first read row\n continue\n\n # NOTE: text readers do not cast types and give back strings, neither should xlsx reader, can't find documentation if it's possible to ignore this via `iter_rows` instead of casting back to string\n container[i] = [DataTypes.to_json(v) for v in row_data]\n\n for j, cell in enumerate(reversed(row_data)):\n if cell is None:\n continue\n\n padding_ends = max(padding_ends, max_column - j)\n\n break\n\n d[sheet_name] = [None if c is None else c[0:padding_ends] for c in container]\n d[\"delimiter\"] = None\n finally:\n book.close()\n\n return d\n "},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.ods_reader_headers","title":"tablite.file_reader_utils.ods_reader_headers(path, delimiter, header_row_index, text_qualifier, linecount) ","text":"Source code in tablite/file_reader_utils.py def ods_reader_headers(path, delimiter, header_row_index, text_qualifier, linecount):\n d = {\n \"delimiter\": None\n }\n sheets = pyexcel.get_book_dict(file_name=str(path))\n\n for sheet_name, data in sheets.items():\n lines = [[DataTypes.to_json(v) for v in row] for row in data[header_row_index:header_row_index+linecount]]\n\n d[sheet_name] = lines\n\n return d\n "},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.get_headers","title":"tablite.file_reader_utils.get_headers(path, delimiter=None, header_row_index=0, text_qualifier=None, linecount=10) ","text":"file format definition csv comma separated values tsv tab separated values csvz a zip file that contains one or many csv files tsvz a zip file that contains one or many tsv files xls a spreadsheet file format created by MS-Excel 97-2003 xlsx MS-Excel Extensions to the Office Open XML SpreadsheetML File Format. 
xlsm an MS-Excel Macro-Enabled Workbook file ods open document spreadsheet fods flat open document spreadsheet json java script object notation html html table of the data structure simple simple presentation rst rStructured Text presentation of the data mediawiki media wiki table Source code in tablite/file_reader_utils.py def get_headers(path, delimiter=None, header_row_index=0, text_qualifier=None, linecount=10):\n \"\"\"\n file format\tdefinition\n csv\t comma separated values\n tsv\t tab separated values\n csvz\ta zip file that contains one or many csv files\n tsvz\ta zip file that contains one or many tsv files\n xls\t a spreadsheet file format created by MS-Excel 97-2003\n xlsx\tMS-Excel Extensions to the Office Open XML SpreadsheetML File Format.\n xlsm\tan MS-Excel Macro-Enabled Workbook file\n ods\t open document spreadsheet\n fods\tflat open document spreadsheet\n json\tjava script object notation\n html\thtml table of the data structure\n simple\tsimple presentation\n rst\t rStructured Text presentation of the data\n mediawiki\tmedia wiki table\n \"\"\"\n if isinstance(path, str):\n path = Path(path)\n if not isinstance(path, Path):\n raise TypeError(\"expected pathlib path.\")\n if not path.exists():\n raise FileNotFoundError(str(path))\n if delimiter is not None:\n if not isinstance(delimiter, str):\n raise TypeError(f\"expected str or None, not {type(delimiter)}\")\n\n kwargs = {\n \"path\": path,\n \"delimiter\": delimiter,\n \"header_row_index\": header_row_index,\n \"text_qualifier\": text_qualifier,\n \"linecount\": linecount\n }\n\n reader = header_readers.get(path.suffix[1:], None)\n\n if reader is None:\n raise TypeError(f\"file format for headers not supported: {path.suffix}\")\n\n result = reader(**kwargs)\n\n return result\n "},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.get_encoding","title":"tablite.file_reader_utils.get_encoding(path, nbytes=ENCODING_GUESS_BYTES) ","text":"Source code in tablite/file_reader_utils.py def get_encoding(path, nbytes=ENCODING_GUESS_BYTES):\n nbytes = min(nbytes, path.stat().st_size)\n with path.open(\"rb\") as fi:\n rawdata = fi.read(nbytes)\n encoding = chardet.detect(rawdata)[\"encoding\"]\n if encoding == \"ascii\": # utf-8 is backwards compatible with ascii\n return \"utf-8\" # -- so should the first 10k chars not be enough,\n return encoding # -- the utf-8 encoding will still get it right.\n "},{"location":"reference/file_reader_utils/#tablite.file_reader_utils.get_delimiter","title":"tablite.file_reader_utils.get_delimiter(path, encoding) ","text":"Source code in tablite/file_reader_utils.py def get_delimiter(path, encoding):\n with path.open(\"r\", encoding=encoding, errors=\"ignore\") as fi:\n lines = []\n for n, line in enumerate(fi):\n line = line.rstrip(\"\\n\")\n lines.append(line)\n if n > 10:\n break # break on first\n delimiter = detect_seperator(\"\\n\".join(lines))\n if delimiter is None:\n raise ValueError(\"Delimiter could not be determined\")\n return delimiter\n "},{"location":"reference/groupby_utils/","title":"Groupby utils","text":""},{"location":"reference/groupby_utils/#tablite.groupby_utils","title":"tablite.groupby_utils ","text":""},{"location":"reference/groupby_utils/#tablite.groupby_utils-classes","title":"Classes","text":""},{"location":"reference/groupby_utils/#tablite.groupby_utils.GroupBy","title":"tablite.groupby_utils.GroupBy ","text":" Bases: object 
"},{"location":"reference/groupby_utils/#tablite.groupby_utils.GroupBy-attributes","title":"Attributes","text":""},{"location":"reference/groupby_utils/#tablite.groupby_utils.GroupBy.max","title":"tablite.groupby_utils.GroupBy.max = 'Max' class-attribute instance-attribute ","text":""},{"location":"reference/groupby_utils/#tablite.groupby_utils.GroupBy.min","title":"tablite.groupby_utils.GroupBy.min = 'Min' class-attribute instance-attribute ","text":""},{"location":"reference/groupby_utils/#tablite.groupby_utils.GroupBy.sum","title":"tablite.groupby_utils.GroupBy.sum = 'Sum' class-attribute instance-attribute ","text":""},{"location":"reference/groupby_utils/#tablite.groupby_utils.GroupBy.product","title":"tablite.groupby_utils.GroupBy.product = 'Product' class-attribute instance-attribute ","text":""},{"location":"reference/groupby_utils/#tablite.groupby_utils.GroupBy.first","title":"tablite.groupby_utils.GroupBy.first = 'First' class-attribute instance-attribute ","text":""},{"location":"reference/groupby_utils/#tablite.groupby_utils.GroupBy.last","title":"tablite.groupby_utils.GroupBy.last = 'Last' class-attribute instance-attribute ","text":""},{"location":"reference/groupby_utils/#tablite.groupby_utils.GroupBy.count","title":"tablite.groupby_utils.GroupBy.count = 'Count' class-attribute instance-attribute ","text":""},{"location":"reference/groupby_utils/#tablite.groupby_utils.GroupBy.count_unique","title":"tablite.groupby_utils.GroupBy.count_unique = 'CountUnique' class-attribute instance-attribute ","text":""},{"location":"reference/groupby_utils/#tablite.groupby_utils.GroupBy.avg","title":"tablite.groupby_utils.GroupBy.avg = 'Average' class-attribute instance-attribute ","text":""},{"location":"reference/groupby_utils/#tablite.groupby_utils.GroupBy.stdev","title":"tablite.groupby_utils.GroupBy.stdev = 'StandardDeviation' class-attribute instance-attribute ","text":""},{"location":"reference/groupby_utils/#tablite.groupby_utils.GroupBy.median","title":"tablite.groupby_utils.GroupBy.median = 'Median' class-attribute instance-attribute ","text":""},{"location":"reference/groupby_utils/#tablite.groupby_utils.GroupBy.mode","title":"tablite.groupby_utils.GroupBy.mode = 'Mode' class-attribute instance-attribute ","text":""},{"location":"reference/import_utils/","title":"Import utils","text":""},{"location":"reference/import_utils/#tablite.import_utils","title":"tablite.import_utils ","text":""},{"location":"reference/import_utils/#tablite.import_utils-attributes","title":"Attributes","text":""},{"location":"reference/import_utils/#tablite.import_utils.file_readers","title":"tablite.import_utils.file_readers = {'fods': excel_reader, 'json': excel_reader, 'html': from_html, 'hdf5': from_hdf5, 'simple': excel_reader, 'rst': excel_reader, 'mediawiki': excel_reader, 'xlsx': excel_reader, 'xls': excel_reader, 'xlsm': excel_reader, 'csv': text_reader, 'tsv': text_reader, 'txt': text_reader, 'ods': ods_reader} module-attribute ","text":""},{"location":"reference/import_utils/#tablite.import_utils.valid_readers","title":"tablite.import_utils.valid_readers = ','.join(list(file_readers.keys())) module-attribute ","text":""},{"location":"reference/import_utils/#tablite.import_utils-classes","title":"Classes","text":""},{"location":"reference/import_utils/#tablite.import_utils.TRconfig","title":"tablite.import_utils.TRconfig(source, destination, start, end, guess_datatypes, delimiter, text_qualifier, text_escape_openings, text_escape_closures, strip_leading_and_tailing_whitespace, encoding, 
newline_offsets, fields) ","text":" Bases: object Source code in tablite/import_utils.py def __init__(\n self,\n source,\n destination,\n start,\n end,\n guess_datatypes,\n delimiter,\n text_qualifier,\n text_escape_openings,\n text_escape_closures,\n strip_leading_and_tailing_whitespace,\n encoding,\n newline_offsets,\n fields\n) -> None:\n self.source = source\n self.destination = destination\n self.start = start\n self.end = end\n self.guess_datatypes = guess_datatypes\n self.delimiter = delimiter\n self.text_qualifier = text_qualifier\n self.text_escape_openings = text_escape_openings\n self.text_escape_closures = text_escape_closures\n self.strip_leading_and_tailing_whitespace = strip_leading_and_tailing_whitespace\n self.encoding = encoding\n self.newline_offsets = newline_offsets\n self.fields = fields\n type_check(start, int),\n type_check(end, int),\n type_check(delimiter, str),\n type_check(text_qualifier, (str, type(None))),\n type_check(text_escape_openings, str),\n type_check(text_escape_closures, str),\n type_check(encoding, str),\n type_check(strip_leading_and_tailing_whitespace, bool),\n type_check(newline_offsets, list)\n type_check(fields, dict)\n "},{"location":"reference/import_utils/#tablite.import_utils.TRconfig-attributes","title":"Attributes","text":""},{"location":"reference/import_utils/#tablite.import_utils.TRconfig.source","title":"tablite.import_utils.TRconfig.source = source instance-attribute ","text":""},{"location":"reference/import_utils/#tablite.import_utils.TRconfig.destination","title":"tablite.import_utils.TRconfig.destination = destination instance-attribute ","text":""},{"location":"reference/import_utils/#tablite.import_utils.TRconfig.start","title":"tablite.import_utils.TRconfig.start = start instance-attribute ","text":""},{"location":"reference/import_utils/#tablite.import_utils.TRconfig.end","title":"tablite.import_utils.TRconfig.end = end instance-attribute ","text":""},{"location":"reference/import_utils/#tablite.import_utils.TRconfig.guess_datatypes","title":"tablite.import_utils.TRconfig.guess_datatypes = guess_datatypes instance-attribute ","text":""},{"location":"reference/import_utils/#tablite.import_utils.TRconfig.delimiter","title":"tablite.import_utils.TRconfig.delimiter = delimiter instance-attribute ","text":""},{"location":"reference/import_utils/#tablite.import_utils.TRconfig.text_qualifier","title":"tablite.import_utils.TRconfig.text_qualifier = text_qualifier instance-attribute ","text":""},{"location":"reference/import_utils/#tablite.import_utils.TRconfig.text_escape_openings","title":"tablite.import_utils.TRconfig.text_escape_openings = text_escape_openings instance-attribute ","text":""},{"location":"reference/import_utils/#tablite.import_utils.TRconfig.text_escape_closures","title":"tablite.import_utils.TRconfig.text_escape_closures = text_escape_closures instance-attribute ","text":""},{"location":"reference/import_utils/#tablite.import_utils.TRconfig.strip_leading_and_tailing_whitespace","title":"tablite.import_utils.TRconfig.strip_leading_and_tailing_whitespace = strip_leading_and_tailing_whitespace instance-attribute ","text":""},{"location":"reference/import_utils/#tablite.import_utils.TRconfig.encoding","title":"tablite.import_utils.TRconfig.encoding = encoding instance-attribute ","text":""},{"location":"reference/import_utils/#tablite.import_utils.TRconfig.newline_offsets","title":"tablite.import_utils.TRconfig.newline_offsets = newline_offsets instance-attribute 
","text":""},{"location":"reference/import_utils/#tablite.import_utils.TRconfig.fields","title":"tablite.import_utils.TRconfig.fields = fields instance-attribute ","text":""},{"location":"reference/import_utils/#tablite.import_utils.TRconfig-functions","title":"Functions","text":""},{"location":"reference/import_utils/#tablite.import_utils.TRconfig.copy","title":"tablite.import_utils.TRconfig.copy() ","text":"Source code in tablite/import_utils.py def copy(self):\n return TRconfig(**self.dict())\n "},{"location":"reference/import_utils/#tablite.import_utils.TRconfig.dict","title":"tablite.import_utils.TRconfig.dict() ","text":"Source code in tablite/import_utils.py def dict(self):\n return {k: v for k, v in self.__dict__.items() if not (k.startswith(\"_\") or callable(v))}\n "},{"location":"reference/import_utils/#tablite.import_utils-functions","title":"Functions","text":""},{"location":"reference/import_utils/#tablite.import_utils.from_pandas","title":"tablite.import_utils.from_pandas(T, df) ","text":"Creates Table using pd.to_dict('list') similar to: import pandas as pd df = pd.DataFrame({'a':[1,2,3], 'b':[4,5,6]}) df a b 0 1 4 1 2 5 2 3 6 df.to_dict('list') t = Table.from_dict(df.to_dict('list)) t.show() +===+===+===+ | # | a | b | |row|int|int| +---+---+---+ | 0 | 1| 4| | 1 | 2| 5| | 2 | 3| 6| +===+===+===+ Source code in tablite/import_utils.py def from_pandas(T, df):\n \"\"\"\n Creates Table using pd.to_dict('list')\n\n similar to:\n >>> import pandas as pd\n >>> df = pd.DataFrame({'a':[1,2,3], 'b':[4,5,6]})\n >>> df\n a b\n 0 1 4\n 1 2 5\n 2 3 6\n >>> df.to_dict('list')\n {'a': [1, 2, 3], 'b': [4, 5, 6]}\n\n >>> t = Table.from_dict(df.to_dict('list))\n >>> t.show()\n +===+===+===+\n | # | a | b |\n |row|int|int|\n +---+---+---+\n | 0 | 1| 4|\n | 1 | 2| 5|\n | 2 | 3| 6|\n +===+===+===+\n \"\"\"\n if not issubclass(T, BaseTable):\n raise TypeError(\"Expected subclass of Table\")\n\n return T(columns=df.to_dict(\"list\")) # noqa\n "},{"location":"reference/import_utils/#tablite.import_utils.from_hdf5","title":"tablite.import_utils.from_hdf5(T, path, tqdm=_tqdm, pbar=None) ","text":"imports an exported hdf5 table. 
Note that some loss of type information is to be expected in columns of mixed type: t.show(dtype=True) +===+===+=====+=====+====+=====+=====+===================+==========+========+===============+===+=========================+=====+===+ | # | A | B | C | D | E | F | G | H | I | J | K | L | M | O | |row|int|mixed|float|str |mixed| bool| datetime | date | time | timedelta |str| int |float|int| +---+---+-----+-----+----+-----+-----+-------------------+----------+--------+---------------+---+-------------------------+-----+---+ | 0 | -1|None | -1.1| |None |False|2023-06-09 09:12:06|2023-06-09|09:12:06| 1 day, 0:00:00|b |-100000000000000000000000| inf| 11| | 1 | 1| 1| 1.1|1000|1 | True|2023-06-09 09:12:06|2023-06-09|09:12:06|2 days, 0:06:40|\u55e8 | 100000000000000000000000| -inf|-11| +===+===+=====+=====+====+=====+=====+===================+==========+========+===============+===+=========================+=====+===+ t.to_hdf5(filename) t2 = Table.from_hdf5(filename) t2.show(dtype=True) +===+===+=====+=====+=====+=====+=====+===================+===================+========+===============+===+=========================+=====+===+ | # | A | B | C | D | E | F | G | H | I | J | K | L | M | O | |row|int|mixed|float|mixed|mixed| bool| datetime | datetime | time | str |str| int |float|int| +---+---+-----+-----+-----+-----+-----+-------------------+-------------------+--------+---------------+---+-------------------------+-----+---+ | 0 | -1|None | -1.1|None |None |False|2023-06-09 09:12:06|2023-06-09 00:00:00|09:12:06|1 day, 0:00:00 |b |-100000000000000000000000| inf| 11| | 1 | 1| 1| 1.1| 1000| 1| True|2023-06-09 09:12:06|2023-06-09 00:00:00|09:12:06|2 days, 0:06:40|\u55e8 | 100000000000000000000000| -inf|-11| +===+===+=====+=====+=====+=====+=====+===================+===================+========+===============+===+=========================+=====+===+ Source code in tablite/import_utils.py def from_hdf5(T, path, tqdm=_tqdm, pbar=None):\n \"\"\"\n imports an exported hdf5 table.\n\n Note that some loss of type information is to be expected in columns of mixed type:\n >>> t.show(dtype=True)\n +===+===+=====+=====+====+=====+=====+===================+==========+========+===============+===+=========================+=====+===+\n | # | A | B | C | D | E | F | G | H | I | J | K | L | M | O |\n |row|int|mixed|float|str |mixed| bool| datetime | date | time | timedelta |str| int |float|int|\n +---+---+-----+-----+----+-----+-----+-------------------+----------+--------+---------------+---+-------------------------+-----+---+\n | 0 | -1|None | -1.1| |None |False|2023-06-09 09:12:06|2023-06-09|09:12:06| 1 day, 0:00:00|b |-100000000000000000000000| inf| 11|\n | 1 | 1| 1| 1.1|1000|1 | True|2023-06-09 09:12:06|2023-06-09|09:12:06|2 days, 0:06:40|\u55e8 | 100000000000000000000000| -inf|-11|\n +===+===+=====+=====+====+=====+=====+===================+==========+========+===============+===+=========================+=====+===+\n >>> t.to_hdf5(filename)\n >>> t2 = Table.from_hdf5(filename)\n >>> t2.show(dtype=True)\n +===+===+=====+=====+=====+=====+=====+===================+===================+========+===============+===+=========================+=====+===+\n | # | A | B | C | D | E | F | G | H | I | J | K | L | M | O |\n |row|int|mixed|float|mixed|mixed| bool| datetime | datetime | time | str |str| int |float|int|\n +---+---+-----+-----+-----+-----+-----+-------------------+-------------------+--------+---------------+---+-------------------------+-----+---+\n | 0 | -1|None | -1.1|None |None |False|2023-06-09 
09:12:06|2023-06-09 00:00:00|09:12:06|1 day, 0:00:00 |b |-100000000000000000000000| inf| 11|\n | 1 | 1| 1| 1.1| 1000| 1| True|2023-06-09 09:12:06|2023-06-09 00:00:00|09:12:06|2 days, 0:06:40|\u55e8 | 100000000000000000000000| -inf|-11|\n +===+===+=====+=====+=====+=====+=====+===================+===================+========+===============+===+=========================+=====+===+\n \"\"\"\n if not issubclass(T, BaseTable):\n raise TypeError(\"Expected subclass of Table\")\n import h5py\n\n type_check(path, Path)\n t = T()\n with h5py.File(path, \"r\") as h5:\n for col_name in h5.keys():\n dset = h5[col_name]\n arr = np.array(dset[:])\n if arr.dtype == object:\n arr = np.array(DataTypes.guess([v.decode(\"utf-8\") for v in arr]))\n t[col_name] = arr\n return t\n "},{"location":"reference/import_utils/#tablite.import_utils.from_json","title":"tablite.import_utils.from_json(T, jsn) ","text":"Imports tables exported using .to_json Source code in tablite/import_utils.py def from_json(T, jsn):\n \"\"\"\n Imports tables exported using .to_json\n \"\"\"\n if not issubclass(T, BaseTable):\n raise TypeError(\"Expected subclass of Table\")\n import json\n\n type_check(jsn, str)\n d = json.loads(jsn)\n return T(columns=d[\"columns\"])\n "},{"location":"reference/import_utils/#tablite.import_utils.from_html","title":"tablite.import_utils.from_html(T, path, tqdm=_tqdm, pbar=None) ","text":"Source code in tablite/import_utils.py def from_html(T, path, tqdm=_tqdm, pbar=None):\n if not issubclass(T, BaseTable):\n raise TypeError(\"Expected subclass of Table\")\n type_check(path, Path)\n\n if pbar is None:\n total = path.stat().st_size\n pbar = tqdm(total=total, desc=\"from_html\", disable=Config.TQDM_DISABLE)\n\n row_start, row_end = \"<tr>\", \"</tr>\"\n value_start, value_end = \"<th>\", \"</th>\"\n chunk = \"\"\n t = None # will be T()\n start, end = 0, 0\n data = {}\n with path.open(\"r\") as fi:\n while True:\n start = chunk.find(row_start, start) # row tag start\n end = chunk.find(row_end, end) # row tag end\n if start == -1 or end == -1:\n new = fi.read(100_000)\n pbar.update(len(new))\n if new == \"\":\n break\n chunk += new\n continue\n # get indices from chunk\n row = chunk[start + len(row_start) : end]\n fields = [v.rstrip(value_end) for v in row.split(value_start)]\n if not data:\n headers = fields[:]\n data = {f: [] for f in headers}\n continue\n else:\n for field, header in zip(fields, headers):\n data[header].append(field)\n\n chunk = chunk[end + len(row_end) :]\n\n if len(data[headers[0]]) == Config.PAGE_SIZE:\n if t is None:\n t = T(columns=data)\n else:\n for k, v in data.items():\n t[k].extend(DataTypes.guess(v))\n data = {f: [] for f in headers}\n\n for k, v in data.items():\n t[k].extend(DataTypes.guess(v))\n return t\n "},{"location":"reference/import_utils/#tablite.import_utils.excel_reader","title":"tablite.import_utils.excel_reader(T, path, first_row_has_headers=True, header_row_index=0, sheet=None, columns=None, skip_empty='NONE', start=0, limit=sys.maxsize, tqdm=_tqdm, **kwargs) ","text":"returns Table from excel **kwargs are excess arguments that are ignored. 
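A minimal usage sketch; the workbook name 'book1.xlsx', the sheet name and the column names are made up for illustration, and Table.import_file is the documented entry point that is assumed to dispatch .xlsx files to this reader: >>> from tablite import Table >>> t = Table.import_file('book1.xlsx', sheet='Sheet1', columns=['a', 'b']) >>> t.show() 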
Source code in tablite/import_utils.py def excel_reader(T, path, first_row_has_headers=True, header_row_index=0, sheet=None, columns=None, skip_empty=\"NONE\", start=0, limit=sys.maxsize, tqdm=_tqdm, **kwargs):\n \"\"\"\n returns Table from excel\n\n **kwargs are excess arguments that are ignored.\n \"\"\"\n if not issubclass(T, BaseTable):\n raise TypeError(\"Expected subclass of Table\")\n\n book = openpyxl.load_workbook(path, read_only=True, data_only=True)\n\n if sheet is None: # help the user.\n \"\"\"\n If no sheet specified, assume first sheet.\n\n Reasoning:\n Pandas ODS reader does that, so this preserves parity and it might be expected by users.\n If we don't know the sheet name but only have single sheet,\n we would need to take extra steps to find out the name of the sheet.\n We already make assumptions in case of column selection,\n when columns are None, we import all of them.\n \"\"\"\n sheet = book.sheetnames[0]\n elif sheet not in book.sheetnames:\n raise ValueError(f\"sheet not found: {sheet}\")\n\n if not (isinstance(start, int) and start >= 0):\n raise ValueError(\"expected start as an integer >=0\")\n if not (isinstance(limit, int) and limit > 0):\n raise ValueError(\"expected limit as integer > 0\")\n\n worksheet = book[sheet]\n fixup_worksheet(worksheet)\n\n try:\n it_header = worksheet.iter_rows(min_row=header_row_index + 1)\n while True:\n # get the first row to know our headers or the number of columns\n row = [c.value for c in next(it_header)]\n break\n fields = [str(c) if c is not None else \"\" for c in row] # excel is offset by 1\n except StopIteration:\n # excel was empty, return empty table\n return T()\n\n if not first_row_has_headers:\n # since the first row did not contain headers, we use the column count to populate header names\n fields = [str(i) for i in range(len(fields))]\n\n if columns is None:\n # no columns were specified by user to import, that means we import all of the them\n columns = []\n\n for f in fields:\n # fixup the duplicate column names\n columns.append(unique_name(f, columns))\n\n field_dict = {k: i for i, k in enumerate(columns)}\n else:\n field_dict = {}\n\n for k, i in ((k, fields.index(k)) for k in columns):\n # fixup the duplicate column names\n field_dict[unique_name(k, field_dict.keys())] = i\n\n # calculate our data rows iterator offset\n it_offset = start + (1 if first_row_has_headers else 0) + header_row_index + 1\n\n # attempt to fetch number of rows in the sheet\n total_rows = worksheet.max_row\n real_tqdm = True\n\n if total_rows is None:\n # i don't know what causes it but max_row can be None in some cases, so we don't know how large the dataset is\n total_rows = it_offset + limit\n real_tqdm = False\n\n # create the actual data rows iterator\n it_rows = worksheet.iter_rows(min_row=it_offset, max_row=min(it_offset+limit, total_rows))\n it_used_indices = list(field_dict.values())\n\n # filter columns that we're not going to use\n it_rows_filtered = ([row[idx].value for idx in it_used_indices] for row in it_rows)\n\n # create page directory\n workdir = Path(Config.workdir) / Config.pid\n pagesdir = workdir/\"pages\"\n pagesdir.mkdir(exist_ok=True, parents=True)\n\n field_names = list(field_dict.keys())\n column_count = len(field_names)\n\n page_fhs = None\n\n # prepopulate the table with columns\n table = T()\n for name in field_names:\n table[name] = Column(table.path)\n\n pbar_fname = path.name\n if len(pbar_fname) > 20:\n pbar_fname = pbar_fname[0:10] + \"...\" + pbar_fname[-7:]\n\n if real_tqdm:\n # we can create a 
true tqdm progress bar, make one\n tqdm_iter = tqdm(it_rows_filtered, total=total_rows, desc=f\"importing excel: {pbar_fname}\")\n else:\n \"\"\"\n openpyxls was unable to precalculate the size of the excel for whatever reason\n forcing recalc would require parsing entire file\n drop the progress bar in that case, just show iterations\n\n as an alternative we can use \u03a3=1/x but it just doesn't look good, show iterations per second instead\n \"\"\"\n tqdm_iter = tqdm(it_rows_filtered, desc=f\"importing excel: {pbar_fname}\")\n\n tqdm_iter = iter(tqdm_iter)\n\n idx = 0\n\n while True:\n try:\n row = next(tqdm_iter)\n except StopIteration:\n break # because in some cases we can't know the size of excel to set the upper iterator limit we loop until stop iteration is encountered\n\n if skip_empty == \"ALL\" and all(v is None for v in row):\n continue\n elif skip_empty == \"ANY\" and any(v is None for v in row):\n continue\n\n if idx % Config.PAGE_SIZE == 0:\n if page_fhs is not None:\n # we reached the max page file size, fix the pages\n [_fix_xls_page(table, c, fh) for c, fh in zip(field_names, page_fhs)]\n\n page_fhs = [None] * column_count\n\n for cidx in range(column_count):\n # allocate new pages\n pg_path = pagesdir / f\"{next(Page.ids)}.npy\"\n page_fhs[cidx] = open(pg_path, \"wb\")\n\n for fh, value in zip(page_fhs, row):\n \"\"\"\n since excel types are already cast into appropriate type we're going to do two passes per page\n\n we create our temporary custom format:\n packed type|packed byte count|packed bytes|...\n\n available types:\n * q - int64\n * d - float64\n * s - string\n * b - boolean\n * n - none\n * p - pickled (date, time, datetime)\n \"\"\"\n dtype = type(value)\n\n if dtype == int:\n ptype, bytes_ = b'q', struct.pack('q', value) # pack int as int64\n elif dtype == float:\n ptype, bytes_ = b'd', struct.pack('d', value) # pack float as float64\n elif dtype == str:\n ptype, bytes_ = b's', value.encode(\"utf-8\") # pack string\n elif dtype == bool:\n ptype, bytes_ = b'b', b'1' if value else b'0' # pack boolean\n elif value is None:\n ptype, bytes_ = b'n', b'' # pack none\n elif dtype in [date, time, datetime]:\n ptype, bytes_ = b'p', pkl.dumps(value) # pack object types via pickle\n else:\n raise NotImplementedError()\n\n byte_count = struct.pack('I', len(bytes_)) # pack our payload size, i doubt payload size can be over uint32\n\n # dump object to file\n fh.write(ptype)\n fh.write(byte_count)\n fh.write(bytes_)\n\n idx = idx + 1\n\n if page_fhs is not None:\n # we reached end of the loop, fix the pages\n [_fix_xls_page(table, c, fh) for c, fh in zip(field_names, page_fhs)]\n\n return table\n "},{"location":"reference/import_utils/#tablite.import_utils.ods_reader","title":"tablite.import_utils.ods_reader(T, path, first_row_has_headers=True, header_row_index=0, sheet=None, columns=None, skip_empty='NONE', start=0, limit=sys.maxsize, **kwargs) ","text":"returns Table from .ODS Source code in tablite/import_utils.py def ods_reader(T, path, first_row_has_headers=True, header_row_index=0, sheet=None, columns=None, skip_empty=\"NONE\", start=0, limit=sys.maxsize, **kwargs):\n \"\"\"\n returns Table from .ODS\n \"\"\"\n if not issubclass(T, BaseTable):\n raise TypeError(\"Expected subclass of Table\")\n\n if sheet is None:\n data = read_excel(str(path), header=None) # selects first sheet\n else:\n data = read_excel(str(path), sheet_name=sheet, header=None)\n\n data[isna(data)] = None # convert any empty cells to None\n data = data.to_numpy().tolist() # convert pandas to 
list\n\n if skip_empty == \"ALL\" or skip_empty == \"ANY\":\n \"\"\" filter out all rows based on predicate that come after header row \"\"\"\n fn_filter = any if skip_empty == \"ALL\" else all # this is intentional\n data = [\n row\n for ridx, row in enumerate(data)\n if ridx < header_row_index + (1 if first_row_has_headers else 0) or fn_filter(not (v is None or isinstance(v, str) and len(v) == 0) for v in row)\n ]\n\n data = np.array(data, dtype=np.object_) # cast back to numpy array for slicing but don't try to convert datatypes\n\n if not (isinstance(start, int) and start >= 0):\n raise ValueError(\"expected start as an integer >=0\")\n if not (isinstance(limit, int) and limit > 0):\n raise ValueError(\"expected limit as integer > 0\")\n\n t = T()\n\n used_columns_names = set()\n for ix, value in enumerate(data[header_row_index]):\n if first_row_has_headers:\n header, start_row_pos = \"\" if value is None else str(value), (1 + header_row_index)\n else:\n header, start_row_pos = f\"_{ix + 1}\", (0 + header_row_index)\n\n if columns is not None:\n if header not in columns:\n continue\n\n unique_column_name = unique_name(str(header), used_columns_names)\n used_columns_names.add(unique_column_name)\n\n column_values = data[start_row_pos : start_row_pos + limit, ix]\n\n t[unique_column_name] = column_values\n return t\n "},{"location":"reference/import_utils/#tablite.import_utils.text_reader_task","title":"tablite.import_utils.text_reader_task(source, destination, start, end, guess_datatypes, delimiter, text_qualifier, text_escape_openings, text_escape_closures, strip_leading_and_tailing_whitespace, encoding, newline_offsets, fields) ","text":"PARALLEL TASK FUNCTION reads column names + path[start:limit] into hdf5. source: csv or txt file destination: filename for page. start: int: start of page. end: int: end of page. 
guess_datatypes: bool: if True datatypes will be inferred by datatypes.Datatypes.guess delimiter: ',' ';' or '|' text_qualifier: str: commonly \" text_escape_openings: str: default: \"({[ text_escape_closures: str: default: ]})\" strip_leading_and_tailing_whitespace: bool encoding: chardet encoding ('utf-8', 'ascii', ..., 'ISO-2022-CN') Source code in tablite/import_utils.py def text_reader_task(\n source,\n destination,\n start,\n end,\n guess_datatypes,\n delimiter,\n text_qualifier,\n text_escape_openings,\n text_escape_closures,\n strip_leading_and_tailing_whitespace,\n encoding,\n newline_offsets,\n fields\n):\n \"\"\"PARALLEL TASK FUNCTION\n reads column names + path[start:limit] into hdf5.\n\n source: csv or txt file\n destination: filename for page.\n start: int: start of page.\n end: int: end of page.\n guess_datatypes: bool: if True datatypes will be inferred by datatypes.Datatypes.guess\n delimiter: ',' ';' or '|'\n text_qualifier: str: commonly \\\"\n text_escape_openings: str: default: \"({[\n text_escape_closures: str: default: ]})\"\n strip_leading_and_tailing_whitespace: bool\n encoding: chardet encoding ('utf-8', 'ascii', ..., 'ISO-2022-CN')\n \"\"\"\n if isinstance(source, str):\n source = Path(source)\n type_check(source, Path)\n if not source.exists():\n raise FileNotFoundError(f\"File not found: {source}\")\n type_check(destination, list)\n\n # declare CSV dialect.\n delim = delimiter\n\n class Dialect(csv.Dialect):\n delimiter = delim\n quotechar = '\"' if text_qualifier is None else text_qualifier\n escapechar = '\\\\'\n doublequote = True\n quoting = csv.QUOTE_MINIMAL\n skipinitialspace = False if strip_leading_and_tailing_whitespace is None else strip_leading_and_tailing_whitespace\n lineterminator = \"\\n\"\n\n with source.open(\"r\", encoding=encoding, errors=\"ignore\") as fi: # --READ\n fi.seek(newline_offsets[start])\n reader = csv.reader(fi, dialect=Dialect)\n\n # if there's an issue with file handlers on windows, we can make a special case for windows where the file is opened on demand and appended instead of opening all handlers at once\n page_file_handlers = [open(f, mode=\"wb\") for f in destination]\n\n # identify longest str\n longest_str = [1 for _ in range(len(destination))]\n for row in (next(reader) for _ in range(end - start)):\n for idx, c in ((fields[idx], c) for idx, c in filter(lambda t: t[0] in fields, enumerate(row))):\n longest_str[idx] = max(longest_str[idx], len(c))\n\n column_formats = [f\"<U{i}\" for i in longest_str]\n for idx, cf in enumerate(column_formats):\n _create_numpy_header(cf, (end - start, ), page_file_handlers[idx])\n\n # write page arrays to files\n fi.seek(newline_offsets[start])\n for row in (next(reader) for _ in range(end - start)):\n for idx, c in ((fields[idx], c) for idx, c in filter(lambda t: t[0] in fields, enumerate(row))):\n cbytes = np.asarray(c, dtype=column_formats[idx]).tobytes()\n page_file_handlers[idx].write(cbytes)\n\n [phf.close() for phf in page_file_handlers]\n "},{"location":"reference/import_utils/#tablite.import_utils.text_reader","title":"tablite.import_utils.text_reader(T, path, columns, first_row_has_headers, header_row_index, encoding, start, limit, newline, guess_datatypes, text_qualifier, strip_leading_and_tailing_whitespace, skip_empty, delimiter, text_escape_openings, text_escape_closures, tqdm=_tqdm, **kwargs) ","text":"Source code in tablite/import_utils.py def text_reader(\n T,\n path,\n columns,\n first_row_has_headers,\n header_row_index,\n encoding,\n start,\n limit,\n newline,\n 
guess_datatypes,\n text_qualifier,\n strip_leading_and_tailing_whitespace,\n skip_empty,\n delimiter,\n text_escape_openings,\n text_escape_closures,\n tqdm=_tqdm,\n **kwargs,\n):\n if encoding is None:\n encoding = get_encoding(path, nbytes=ENCODING_GUESS_BYTES)\n\n enc = py_to_nim_encoding(encoding)\n pid = Config.workdir / Config.pid\n kwargs = {}\n\n if first_row_has_headers is not None:\n kwargs[\"first_row_has_headers\"] = first_row_has_headers\n if header_row_index is not None:\n kwargs[\"header_row_index\"] = header_row_index\n if columns is not None:\n kwargs[\"columns\"] = columns\n if start is not None:\n kwargs[\"start\"] = start\n if limit is not None and limit != sys.maxsize:\n kwargs[\"limit\"] = limit\n if guess_datatypes is not None:\n kwargs[\"guess_datatypes\"] = guess_datatypes\n if newline is not None:\n kwargs[\"newline\"] = newline\n if delimiter is not None:\n kwargs[\"delimiter\"] = delimiter\n if text_qualifier is not None:\n kwargs[\"text_qualifier\"] = text_qualifier\n kwargs[\"quoting\"] = \"QUOTE_MINIMAL\"\n else:\n kwargs[\"quoting\"] = \"QUOTE_NONE\"\n if strip_leading_and_tailing_whitespace is not None:\n kwargs[\"strip_leading_and_tailing_whitespace\"] = strip_leading_and_tailing_whitespace\n\n if skip_empty is None:\n kwargs[\"skip_empty\"] = \"NONE\"\n else:\n kwargs[\"skip_empty\"] = skip_empty\n\n return nimlite.text_reader(\n T, pid, path, enc,\n **kwargs,\n tqdm=tqdm\n )\n "},{"location":"reference/import_utils/#tablite.import_utils-modules","title":"Modules","text":""},{"location":"reference/imputation/","title":"Imputation","text":""},{"location":"reference/imputation/#tablite.imputation","title":"tablite.imputation ","text":""},{"location":"reference/imputation/#tablite.imputation-classes","title":"Classes","text":""},{"location":"reference/imputation/#tablite.imputation-functions","title":"Functions","text":""},{"location":"reference/imputation/#tablite.imputation.imputation","title":"tablite.imputation.imputation(T, targets, missing=None, method='carry forward', sources=None, tqdm=_tqdm, pbar=None) ","text":"In statistics, imputation is the process of replacing missing data with substituted values. See more: https://en.wikipedia.org/wiki/Imputation_(statistics) PARAMETER DESCRIPTION table source table. TYPE: Table targets column names to find and replace missing values TYPE: str or list of strings missing values to be replaced. TYPE: None or iterable DEFAULT: None method method to be used for replacement. Options: 'carry forward': takes the previous value, and carries forward into fields where values are missing. +: quick. Realistic on time series. -: Can produce strange outliers. 'mean': calculates the column mean (exclude missing ) and copies the mean in as replacement. +: quick -: doesn't work on text. Causes data set to drift towards the mean. 'mode': calculates the column mode (exclude missing ) and copies the mode in as replacement. +: quick -: most frequent value becomes over-represented in the sample 'nearest neighbour': calculates normalised distance between items in source columns selects nearest neighbour and copies value as replacement. +: works for any datatype. -: computationally intensive (e.g. slow) TYPE: str DEFAULT: 'carry forward' sources NEAREST NEIGHBOUR ONLY column names to be used during imputation. if None or empty, all columns will be used. TYPE: list of strings DEFAULT: None RETURNS DESCRIPTION table table with replaced values. 
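A minimal usage sketch; the table t and the target column 'price' are made up for illustration, and the call simply follows the signature above: >>> from tablite.imputation import imputation >>> t2 = imputation(t, targets=['price'], missing={None}, method='mean') If the Table class exposes this as a method, the equivalent call would be t.imputation(targets=['price'], method='mean'). 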
Source code in tablite/imputation.py def imputation(T, targets, missing=None, method=\"carry forward\", sources=None, tqdm=_tqdm, pbar=None):\n \"\"\"\n In statistics, imputation is the process of replacing missing data with substituted values.\n\n See more: https://en.wikipedia.org/wiki/Imputation_(statistics)\n\n Args:\n table (Table): source table.\n\n targets (str or list of strings): column names to find and\n replace missing values\n\n missing (None or iterable): values to be replaced.\n\n method (str): method to be used for replacement. Options:\n\n 'carry forward':\n takes the previous value, and carries forward into fields\n where values are missing.\n +: quick. Realistic on time series.\n -: Can produce strange outliers.\n\n 'mean':\n calculates the column mean (exclude `missing`) and copies\n the mean in as replacement.\n +: quick\n -: doesn't work on text. Causes data set to drift towards the mean.\n\n 'mode':\n calculates the column mode (exclude `missing`) and copies\n the mode in as replacement.\n +: quick\n -: most frequent value becomes over-represented in the sample\n\n 'nearest neighbour':\n calculates normalised distance between items in source columns\n selects nearest neighbour and copies value as replacement.\n +: works for any datatype.\n -: computationally intensive (e.g. slow)\n\n sources (list of strings): NEAREST NEIGHBOUR ONLY\n column names to be used during imputation.\n if None or empty, all columns will be used.\n\n Returns:\n table: table with replaced values.\n \"\"\"\n sub_cls_check(T, BaseTable)\n\n if isinstance(targets, str) and targets not in T.columns:\n targets = [targets]\n if isinstance(targets, list):\n for name in targets:\n if not isinstance(name, str):\n raise TypeError(f\"expected str, not {type(name)}\")\n if name not in T.columns:\n raise ValueError(f\"target item {name} not a column name in T.columns:\\n{T.columns}\")\n else:\n raise TypeError(\"Expected source as list of column names\")\n\n if missing is None:\n missing = {None}\n else:\n missing = set(missing)\n\n if method == \"nearest neighbour\":\n if sources in (None, []):\n sources = list(T.columns)\n if isinstance(sources, str):\n sources = [sources]\n if isinstance(sources, list):\n for name in sources:\n if not isinstance(name, str):\n raise TypeError(f\"expected str, not {type(name)}\")\n if name not in T.columns:\n raise ValueError(f\"source item {name} not a column name in T.columns:\\n{T.columns}\")\n else:\n raise TypeError(\"Expected source as list of column names\")\n\n methods = [\"nearest neighbour\", \"mean\", \"mode\", \"carry forward\"]\n\n if method == \"carry forward\":\n return carry_forward(T, targets, missing, tqdm=tqdm, pbar=pbar)\n elif method in {\"mean\", \"mode\"}:\n return stats_method(T, targets, missing, method, tqdm=tqdm, pbar=pbar)\n elif method == \"nearest neighbour\":\n return nearest_neighbour(T, sources, missing, targets, tqdm=tqdm)\n else:\n raise ValueError(f\"method {method} not recognised amongst known methods: {list(methods)}\")\n "},{"location":"reference/imputation/#tablite.imputation.carry_forward","title":"tablite.imputation.carry_forward(T, targets, missing, tqdm=_tqdm, pbar=None) ","text":"Source code in tablite/imputation.py def carry_forward(T, targets, missing, tqdm=_tqdm, pbar=None):\n assert isinstance(missing, set)\n\n if pbar is None:\n total = len(targets) * len(T)\n pbar = tqdm(total=total, desc=\"imputation.carry_forward\", disable=Config.TQDM_DISABLE)\n\n new = T.copy()\n for name in T.columns:\n if name in targets:\n data = 
T[name][:] # create copy\n last_value = None\n for ix, v in enumerate(data):\n if v in missing: # perform replacement\n data[ix] = last_value\n else: # keep last value.\n last_value = v\n pbar.update(1)\n new[name] = data\n else:\n new[name] = T[name]\n\n return new\n "},{"location":"reference/imputation/#tablite.imputation.stats_method","title":"tablite.imputation.stats_method(T, targets, missing, method, tqdm=_tqdm, pbar=None) ","text":"Source code in tablite/imputation.py def stats_method(T, targets, missing, method, tqdm=_tqdm, pbar=None):\n assert isinstance(missing, set)\n\n if pbar is None:\n total = len(targets)\n pbar = tqdm(total=total, desc=f\"imputation.{method}\", disable=Config.TQDM_DISABLE)\n\n new = T.copy()\n for name in T.columns:\n if name in targets:\n col = T.columns[name]\n assert isinstance(col, Column)\n\n hist_values, hist_counts = col.histogram()\n\n for m in missing:\n try:\n idx = hist_values.index(m)\n hist_counts[idx] = 0\n except ValueError:\n pass\n\n stats = summary_statistics(hist_values, hist_counts)\n\n new_value = stats[method]\n col.replace(mapping={m: new_value for m in missing})\n new[name] = col\n pbar.update(1)\n else:\n new[name] = T[name] # no entropy, keep as is.\n\n return new\n "},{"location":"reference/imputation/#tablite.imputation-modules","title":"Modules","text":""},{"location":"reference/joins/","title":"Joins","text":""},{"location":"reference/joins/#tablite.joins","title":"tablite.joins ","text":""},{"location":"reference/joins/#tablite.joins-classes","title":"Classes","text":""},{"location":"reference/joins/#tablite.joins-functions","title":"Functions","text":""},{"location":"reference/joins/#tablite.joins.join","title":"tablite.joins.join(T: BaseTable, other: BaseTable, left_keys: List[str], right_keys: List[str], left_columns: Union[List[str], None], right_columns: Union[List[str], None], kind: str = 'inner', merge_keys: bool = False, tqdm=_tqdm, pbar=None) ","text":"short-cut for all join functions. PARAMETER DESCRIPTION T left table TYPE: Table other right table TYPE: Table left_keys list of keys for the join from left table. TYPE: list right_keys list of keys for the join from right table. TYPE: list left_columns list of columns names to retain from left table. If None, all are retained. TYPE: list right_columns list of columns names to retain from right table. If None, all are retained. TYPE: list kind 'inner', 'left', 'outer', 'cross'. Defaults to \"inner\". TYPE: str DEFAULT: 'inner' tqdm tqdm progress counter. Defaults to _tqdm. TYPE: tqdm DEFAULT: tqdm pbar tqdm.progressbar. Defaults to None. TYPE: pbar DEFAULT: None RAISES DESCRIPTION ValueError if join type is unknown. RETURNS DESCRIPTION Table joined table. 
Example: \"inner\" SQL: SELECT number, letter FROM numbers JOIN letters ON numbers.colour == letters.color\n Tablite: >>> inner_join = numbers.inner_join(\n letters, \n left_keys=['colour'], \n right_keys=['color'], \n left_columns=['number'], \n right_columns=['letter']\n)\n Example: \"left\" SQL: SELECT number, letter FROM numbers LEFT JOIN letters ON numbers.colour == letters.color\n Tablite: >>> left_join = numbers.left_join(\n letters, \n left_keys=['colour'], \n right_keys=['color'], \n left_columns=['number'], \n right_columns=['letter']\n)\n Example: \"outer\" SQL: SELECT number, letter FROM numbers OUTER JOIN letters ON numbers.colour == letters.color\n Tablite: >>> outer_join = numbers.outer_join(\n letters, \n left_keys=['colour'], \n right_keys=['color'], \n left_columns=['number'], \n right_columns=['letter']\n )\n Example: \"cross\" CROSS JOIN returns the Cartesian product of rows from tables in the join. In other words, it will produce rows which combine each row from the first table with each row from the second table Source code in tablite/joins.py def join(\n T: BaseTable,\n other: BaseTable,\n left_keys: List[str],\n right_keys: List[str],\n left_columns: Union[List[str], None],\n right_columns: Union[List[str], None],\n kind: str = \"inner\",\n merge_keys: bool = False,\n tqdm=_tqdm,\n pbar=None,\n):\n \"\"\"short-cut for all join functions.\n\n Args:\n T (Table): left table\n other (Table): right table\n left_keys (list): list of keys for the join from left table.\n right_keys (list): list of keys for the join from right table.\n left_columns (list): list of columns names to retain from left table.\n If None, all are retained.\n right_columns (list): list of columns names to retain from right table.\n If None, all are retained.\n kind (str, optional): 'inner', 'left', 'outer', 'cross'. Defaults to \"inner\".\n tqdm (tqdm, optional): tqdm progress counter. Defaults to _tqdm.\n pbar (tqdm.pbar, optional): tqdm.progressbar. 
Defaults to None.\n\n Raises:\n ValueError: if join type is unknown.\n\n Returns:\n Table: joined table.\n\n Example: \"inner\"\n ```\n SQL: SELECT number, letter FROM numbers JOIN letters ON numbers.colour == letters.color\n ```\n Tablite: \n ```\n >>> inner_join = numbers.inner_join(\n letters, \n left_keys=['colour'], \n right_keys=['color'], \n left_columns=['number'], \n right_columns=['letter']\n )\n ```\n\n Example: \"left\" \n ```\n SQL: SELECT number, letter FROM numbers LEFT JOIN letters ON numbers.colour == letters.color\n ```\n Tablite: \n ```\n >>> left_join = numbers.left_join(\n letters, \n left_keys=['colour'], \n right_keys=['color'], \n left_columns=['number'], \n right_columns=['letter']\n )\n ```\n\n Example: \"outer\"\n ```\n SQL: SELECT number, letter FROM numbers OUTER JOIN letters ON numbers.colour == letters.color\n ```\n\n Tablite: \n ```\n >>> outer_join = numbers.outer_join(\n letters, \n left_keys=['colour'], \n right_keys=['color'], \n left_columns=['number'], \n right_columns=['letter']\n )\n ```\n\n Example: \"cross\"\n\n CROSS JOIN returns the Cartesian product of rows from tables in the join.\n In other words, it will produce rows which combine each row from the first table\n with each row from the second table\n \"\"\"\n if left_columns is None:\n left_columns = list(T.columns)\n if right_columns is None:\n right_columns = list(other.columns)\n assert merge_keys in {True,False}\n\n _jointype_check(T, other, left_keys, right_keys, left_columns, right_columns)\n\n return _join(kind, T,other,left_keys, right_keys, left_columns, right_columns, merge_keys=merge_keys,\n tqdm=tqdm, pbar=pbar)\n "},{"location":"reference/joins/#tablite.joins.inner_join","title":"tablite.joins.inner_join(T: BaseTable, other: BaseTable, left_keys: List[str], right_keys: List[str], left_columns: Union[List[str], None], right_columns: Union[List[str], None], merge_keys: bool = False, tqdm=_tqdm, pbar=None) ","text":"Source code in tablite/joins.py def inner_join(T: BaseTable, other: BaseTable, left_keys: List[str], right_keys: List[str], \n left_columns: Union[List[str], None], right_columns: Union[List[str], None],\n merge_keys: bool = False, tqdm=_tqdm, pbar=None):\n return join(T, other, left_keys, right_keys, left_columns, right_columns, kind=\"inner\", merge_keys=merge_keys, tqdm=tqdm,pbar=pbar)\n "},{"location":"reference/joins/#tablite.joins.left_join","title":"tablite.joins.left_join(T: BaseTable, other: BaseTable, left_keys: List[str], right_keys: List[str], left_columns: Union[List[str], None], right_columns: Union[List[str], None], merge_keys: bool = False, tqdm=_tqdm, pbar=None) ","text":"Source code in tablite/joins.py def left_join(T: BaseTable, other: BaseTable, left_keys: List[str], right_keys: List[str], \n left_columns: Union[List[str], None], right_columns: Union[List[str], None],\n merge_keys: bool = False, tqdm=_tqdm, pbar=None):\n return join(T, other, left_keys, right_keys, left_columns, right_columns, kind=\"left\", merge_keys=merge_keys, tqdm=tqdm,pbar=pbar)\n "},{"location":"reference/joins/#tablite.joins.outer_join","title":"tablite.joins.outer_join(T: BaseTable, other: BaseTable, left_keys: List[str], right_keys: List[str], left_columns: Union[List[str], None], right_columns: Union[List[str], None], merge_keys: bool = False, tqdm=_tqdm, pbar=None) ","text":"Source code in tablite/joins.py def outer_join(T: BaseTable, other: BaseTable, left_keys: List[str], right_keys: List[str], \n left_columns: Union[List[str], None], right_columns: Union[List[str], 
None],\n merge_keys: bool = False, tqdm=_tqdm, pbar=None):\n return join(T, other, left_keys, right_keys, left_columns, right_columns, kind=\"outer\", merge_keys=merge_keys, tqdm=tqdm,pbar=pbar)\n "},{"location":"reference/joins/#tablite.joins.cross_join","title":"tablite.joins.cross_join(T: BaseTable, other: BaseTable, left_keys: List[str], right_keys: List[str], left_columns: Union[List[str], None], right_columns: Union[List[str], None], merge_keys: bool = False, tqdm=_tqdm, pbar=None) ","text":"Source code in tablite/joins.py def cross_join(T: BaseTable, other: BaseTable, left_keys: List[str], right_keys: List[str], \n left_columns: Union[List[str], None], right_columns: Union[List[str], None],\n merge_keys: bool = False, tqdm=_tqdm, pbar=None):\n return join(T, other, left_keys, right_keys, left_columns, right_columns, kind=\"cross\", merge_keys=merge_keys, tqdm=tqdm,pbar=pbar)\n "},{"location":"reference/lookup/","title":"Lookup","text":""},{"location":"reference/lookup/#tablite.lookup","title":"tablite.lookup ","text":""},{"location":"reference/lookup/#tablite.lookup-attributes","title":"Attributes","text":""},{"location":"reference/lookup/#tablite.lookup-classes","title":"Classes","text":""},{"location":"reference/lookup/#tablite.lookup-functions","title":"Functions","text":""},{"location":"reference/lookup/#tablite.lookup.lookup","title":"tablite.lookup.lookup(T, other, *criteria, all=True, tqdm=_tqdm) ","text":"function for looking up values in other according to criteria in ascending order. :param: T: Table :param: other: Table sorted in ascending search order. :param: criteria: Each criteria must be a tuple with value comparisons in the form: (LEFT, OPERATOR, RIGHT) :param: all: boolean: True=ALL, False=ANY OPERATOR must be a callable that returns a boolean LEFT must be a value that the OPERATOR can compare. RIGHT must be a value that the OPERATOR can compare. Examples: comparison of two columns: ('column A', \"==\", 'column B')\n compare value from column 'Date' with date 24/12. 
('Date', \"<\", DataTypes.date(24,12) )\n uses custom function to compare value from column 'text 1' with value from column 'text 2' f = lambda L,R: all( ord(L) < ord(R) )\n('text 1', f, 'text 2')\n Source code in tablite/lookup.py def lookup(T, other, *criteria, all=True, tqdm=_tqdm):\n \"\"\"function for looking up values in `other` according to criteria in ascending order.\n :param: T: Table \n :param: other: Table sorted in ascending search order.\n :param: criteria: Each criteria must be a tuple with value comparisons in the form:\n (LEFT, OPERATOR, RIGHT)\n :param: all: boolean: True=ALL, False=ANY\n\n OPERATOR must be a callable that returns a boolean\n LEFT must be a value that the OPERATOR can compare.\n RIGHT must be a value that the OPERATOR can compare.\n\n Examples:\n comparison of two columns:\n\n ('column A', \"==\", 'column B')\n\n compare value from column 'Date' with date 24/12.\n\n ('Date', \"<\", DataTypes.date(24,12) )\n\n uses custom function to compare value from column\n 'text 1' with value from column 'text 2'\n\n f = lambda L,R: all( ord(L) < ord(R) )\n ('text 1', f, 'text 2')\n\n \"\"\"\n sub_cls_check(T, BaseTable)\n sub_cls_check(other, BaseTable)\n\n all = all\n any = not all\n\n ops = lookup_ops\n\n functions, left_criteria, right_criteria = [], set(), set()\n\n for left, op, right in criteria:\n left_criteria.add(left)\n right_criteria.add(right)\n if callable(op):\n pass # it's a custom function.\n else:\n op = ops.get(op, None)\n if not callable(op):\n raise ValueError(f\"{op} not a recognised operator for comparison.\")\n\n functions.append((op, left, right))\n left_columns = [n for n in left_criteria if n in T.columns]\n right_columns = [n for n in right_criteria if n in other.columns]\n\n result_index = np.empty(shape=(len(T)), dtype=np.int64)\n cache = {}\n left = T[left_columns]\n Constr = type(T)\n if isinstance(left, Column):\n tmp, left = left, Constr()\n left[left_columns[0]] = tmp\n right = other[right_columns]\n if isinstance(right, Column):\n tmp, right = right, Constr()\n right[right_columns[0]] = tmp\n assert isinstance(left, BaseTable)\n assert isinstance(right, BaseTable)\n\n for ix, row1 in tqdm(enumerate(left.rows), total=len(T), disable=Config.TQDM_DISABLE):\n row1_tup = tuple(row1)\n row1d = {name: value for name, value in zip(left_columns, row1)}\n row1_hash = hash(row1_tup)\n\n match_found = True if row1_hash in cache else False\n\n if not match_found: # search.\n for row2ix, row2 in enumerate(right.rows):\n row2d = {name: value for name, value in zip(right_columns, row2)}\n\n evaluations = {op(row1d.get(left, left), row2d.get(right, right)) for op, left, right in functions}\n # The evaluations above does a neat trick:\n # as L is a dict, L.get(left, L) will return a value\n # from the columns IF left is a column name. 
If it isn't\n # the function will treat left as a value.\n # The same applies to right.\n all_ = all and (False not in evaluations)\n any_ = any and True in evaluations\n if all_ or any_:\n match_found = True\n cache[row1_hash] = row2ix\n break\n\n if not match_found: # no match found.\n cache[row1_hash] = -1 # -1 is replacement for None in the index as numpy can't handle Nones.\n\n result_index[ix] = cache[row1_hash]\n\n f = select_processing_method(2 * max(len(T), len(other)), _sp_lookup, _mp_lookup)\n return f(T, other, result_index)\n "},{"location":"reference/match/","title":"Match","text":""},{"location":"reference/match/#tablite.match","title":"tablite.match ","text":""},{"location":"reference/match/#tablite.match-classes","title":"Classes","text":""},{"location":"reference/match/#tablite.match-functions","title":"Functions","text":""},{"location":"reference/match/#tablite.match.match","title":"tablite.match.match(T, other, *criteria, keep_left=None, keep_right=None) ","text":"performs inner join where T matches other and removes rows that do not match. :param: T: Table :param: other: Table :param: criteria: Each criteria must be a tuple with value comparisons in the form: (LEFT, OPERATOR, RIGHT), where operator must be \"==\"\n\nExample:\n ('column A', \"==\", 'column B')\n\nThis syntax follows the lookup syntax. See Lookup for details.\n :param: keep_left: list of columns to keep. :param: keep_right: list of right columns to keep. Source code in tablite/match.py def match(T, other, *criteria, keep_left=None, keep_right=None): # lookup and filter combined - drops unmatched rows.\n \"\"\"\n performs inner join where `T` matches `other` and removes rows that do not match.\n\n :param: T: Table\n :param: other: Table\n :param: criteria: Each criteria must be a tuple with value comparisons in the form:\n\n (LEFT, OPERATOR, RIGHT), where operator must be \"==\"\n\n Example:\n ('column A', \"==\", 'column B')\n\n This syntax follows the lookup syntax. See Lookup for details.\n\n :param: keep_left: list of columns to keep.\n :param: keep_right: list of right columns to keep.\n \"\"\"\n assert isinstance(T, BaseTable)\n assert isinstance(other, BaseTable)\n if keep_left is None:\n keep_left = [n for n in T.columns]\n else:\n type_check(keep_left, list)\n name_check(T.columns, *keep_left)\n\n if keep_right is None:\n keep_right = [n for n in other.columns]\n else:\n type_check(keep_right, list)\n name_check(other.columns, *keep_right)\n\n indices = np.full(shape=(len(T),), fill_value=-1, dtype=np.int64)\n for arg in criteria:\n b,_,a = arg\n if _ != \"==\":\n raise ValueError(\"match requires A == B. 
For other logic visit `lookup`\")\n if b not in T.columns:\n raise ValueError(f\"Column {b} not found in T for criteria: {arg}\")\n if a not in other.columns:\n raise ValueError(f\"Column {a} not found in other for criteria: {arg}\")\n\n index_update = find_indices(other[a][:], T[b][:], fill_value=-1)\n indices = merge_indices(indices, index_update)\n\n cls = type(T)\n new = cls()\n for name in T.columns:\n if name in keep_left:\n new[name] = np.compress(indices != -1, T[name][:])\n\n for name in other.columns:\n if name in keep_right:\n new_name = unique_name(name, new.columns)\n primary = np.compress(indices != -1, indices)\n new[new_name] = np.take(other[name][:], primary)\n\n return new\n "},{"location":"reference/match/#tablite.match.find_indices","title":"tablite.match.find_indices(x, y, fill_value=-1) ","text":"finds index of y in x Source code in tablite/match.py def find_indices(x,y, fill_value=-1): # fast.\n \"\"\"\n finds index of y in x\n \"\"\"\n # disassembly of numpy:\n # import numpy as np\n # x = np.array([3, 5, 7, 1, 9, 8, 6, 6])\n # y = np.array([2, 1, 5, 10, 100, 6])\n index = np.argsort(x) # array([3, 0, 1, 6, 7, 2, 5, 4])\n sorted_x = x[index] # array([1, 3, 5, 6, 6, 7, 8, 9])\n sorted_index = np.searchsorted(sorted_x, y) # array([1, 0, 2, 8, 8, 3])\n yindex = np.take(index, sorted_index, mode=\"clip\") # array([0, 3, 1, 4, 4, 6])\n mask = x[yindex] != y # array([ True, False, False, True, True, False])\n indices = np.ma.array(yindex, mask=mask, fill_value=fill_value) \n # masked_array(data=[--, 3, 1, --, --, 6], mask=[ True, False, False, True, True, False], fill_value=999999)\n # --: y[0] not in x\n # 3 : y[1] == x[3]\n # 1 : y[2] == x[1]\n # --: y[3] not in x\n # --: y[4] not in x\n # 6 : y[5] == x[6]\n result = np.where(~indices.mask, indices.data, -1) \n return result # array([-1, 3, 1, -1, -1, 6])\n "},{"location":"reference/match/#tablite.match.merge_indices","title":"tablite.match.merge_indices(x1, *args, fill_value=-1) ","text":"merges x1 with the index arrays in args: entries equal to fill_value in x1 are replaced by values taken from the next array Source code in tablite/match.py def merge_indices(x1, *args, fill_value=-1):\n \"\"\"\n merges x1 with the index arrays in args: entries equal to\n fill_value in x1 are replaced by values taken from the next array.\n \"\"\"\n # dis:\n # >>> AA = array([-1, 3, -1, 5])\n # >>> BB = array([-1, -1, 4, 5])\n new = x1[:] # = AA\n for arg in args:\n mask = (new == fill_value) # array([True, False, True, False])\n new = np.where(mask, arg, new) # array([-1, 3, 4, 5])\n return new # array([-1, 3, 4, 5])\n "},{"location":"reference/merge/","title":"Merge","text":""},{"location":"reference/merge/#tablite.merge","title":"tablite.merge ","text":""},{"location":"reference/merge/#tablite.merge-classes","title":"Classes","text":""},{"location":"reference/merge/#tablite.merge-functions","title":"Functions","text":""},{"location":"reference/merge/#tablite.merge.where","title":"tablite.merge.where(T, criteria, left, right, new) ","text":"takes from LEFT where criteria is True else RIGHT and creates a single new column. 
:param: T: Table :param: criteria: np.array(bool): if True take left column else take right column :param left: (str) column name :param right: (str) column name :param new: (str) new name :returns: T Source code in tablite/merge.py def where(T, criteria, left, right, new):\n \"\"\" takes from LEFT where criteria is True else RIGHT \n and creates a single new column.\n\n :param: T: Table\n :param: criteria: np.array(bool): \n if True take left column\n else take right column\n :param left: (str) column name\n :param right: (str) column name\n :param new: (str) new name\n\n :returns: T\n \"\"\"\n type_check(T, BaseTable)\n if isinstance(criteria, np.ndarray):\n if not criteria.dtype == \"bool\":\n raise TypeError\n else:\n criteria = np.array(criteria, dtype='bool')\n\n new_uq = unique_name(new, list(T.columns))\n T.add_column(new_uq)\n col = T[new_uq]\n\n for start,end in Config.page_steps(len(criteria)):\n left_values = T[left][start:end]\n right_values = T[right][start:end]\n new_values = np.where(criteria, left_values, right_values)\n col.extend(new_values)\n\n if new == right:\n T[right] = T[new_uq] # keep column order\n del T[new_uq]\n del T[left]\n elif new == left:\n T[left] = T[new_uq] # keep column order\n del T[new_uq]\n del T[right]\n else:\n T[new] = T[new_uq]\n del T[left]\n del T[right]\n return T\n "},{"location":"reference/mp_utils/","title":"Mp utils","text":""},{"location":"reference/mp_utils/#tablite.mp_utils","title":"tablite.mp_utils ","text":""},{"location":"reference/mp_utils/#tablite.mp_utils-attributes","title":"Attributes","text":""},{"location":"reference/mp_utils/#tablite.mp_utils.lookup_ops","title":"tablite.mp_utils.lookup_ops = {'in': _in, 'not in': not_in, '<': operator.lt, '<=': operator.le, '>': operator.gt, '>=': operator.ge, '!=': operator.ne, '==': operator.eq} module-attribute ","text":""},{"location":"reference/mp_utils/#tablite.mp_utils.filter_ops","title":"tablite.mp_utils.filter_ops = {'>': operator.gt, '>=': operator.ge, '==': operator.eq, '<': operator.lt, '<=': operator.le, '!=': operator.ne, 'in': _in} module-attribute ","text":""},{"location":"reference/mp_utils/#tablite.mp_utils.filter_ops_from_text","title":"tablite.mp_utils.filter_ops_from_text = {'gt': '>', 'gteq': '>=', 'eq': '==', 'lt': '<', 'lteq': '<=', 'neq': '!=', 'in': _in} module-attribute ","text":""},{"location":"reference/mp_utils/#tablite.mp_utils-classes","title":"Classes","text":""},{"location":"reference/mp_utils/#tablite.mp_utils-functions","title":"Functions","text":""},{"location":"reference/mp_utils/#tablite.mp_utils.not_in","title":"tablite.mp_utils.not_in(a, b) ","text":"Source code in tablite/mp_utils.py def not_in(a, b):\n return not operator.contains(str(a), str(b))\n "},{"location":"reference/mp_utils/#tablite.mp_utils.is_mp","title":"tablite.mp_utils.is_mp(fields: int) -> bool ","text":"PARAMETER DESCRIPTION fields number of fields TYPE: int RETURNS DESCRIPTION bool bool Source code in tablite/mp_utils.py def is_mp(fields: int) -> bool:\n \"\"\"\n\n Args:\n fields (int): number of fields\n\n Returns:\n bool\n \"\"\"\n if Config.MULTIPROCESSING_MODE == Config.FORCE:\n return True\n\n if Config.MULTIPROCESSING_MODE == Config.FALSE:\n return False\n\n if fields < Config.SINGLE_PROCESSING_LIMIT:\n return False\n\n if max(psutil.cpu_count(logical=False), 1) < 2:\n return False\n\n return True\n "},{"location":"reference/mp_utils/#tablite.mp_utils.select_processing_method","title":"tablite.mp_utils.select_processing_method(fields, sp, mp) ","text":"PARAMETER DESCRIPTION 
fields number of fields TYPE: int sp method for single processing TYPE: callable mp method for multiprocessing TYPE: callable RETURNS DESCRIPTION _type_ description Source code in tablite/mp_utils.py def select_processing_method(fields, sp, mp):\n \"\"\"\n\n Args:\n fields (int): number of fields\n sp (callable): method for single processing\n mp (callable): method for multiprocessing\n\n Returns:\n _type_: _description_\n \"\"\"\n return mp if is_mp(fields) else sp\n "},{"location":"reference/mp_utils/#tablite.mp_utils.maskify","title":"tablite.mp_utils.maskify(arr) ","text":"Source code in tablite/mp_utils.py def maskify(arr):\n none_mask = [False] * len(arr) # Setting the default\n\n for i in range(len(arr)):\n if arr[i] is None: # Check if our value is None\n none_mask[i] = True\n arr[i] = 0 # Remove None from the original array\n\n return none_mask\n "},{"location":"reference/mp_utils/#tablite.mp_utils.share_mem","title":"tablite.mp_utils.share_mem(inp_arr, dtype) ","text":"Source code in tablite/mp_utils.py def share_mem(inp_arr, dtype):\n len_ = len(inp_arr)\n size = np.dtype(dtype).itemsize * len_\n shape = (len_,)\n\n out_shm = shared_memory.SharedMemory(create=True, size=size) # the co_processors will read this.\n out_arr_index = np.ndarray(shape, dtype=dtype, buffer=out_shm.buf)\n out_arr_index[:] = inp_arr\n\n return out_arr_index, out_shm\n "},{"location":"reference/mp_utils/#tablite.mp_utils.map_task","title":"tablite.mp_utils.map_task(data_shm_name, index_shm_name, destination_shm_name, shape, dtype, start, end) ","text":"Source code in tablite/mp_utils.py def map_task(data_shm_name, index_shm_name, destination_shm_name, shape, dtype, start, end):\n # connect\n shared_data = shared_memory.SharedMemory(name=data_shm_name)\n data = np.ndarray(shape, dtype=dtype, buffer=shared_data.buf)\n\n shared_index = shared_memory.SharedMemory(name=index_shm_name)\n index = np.ndarray(shape, dtype=np.int64, buffer=shared_index.buf)\n\n shared_target = shared_memory.SharedMemory(name=destination_shm_name)\n target = np.ndarray(shape, dtype=dtype, buffer=shared_target.buf)\n # work\n target[start:end] = np.take(data[start:end], index[start:end])\n # disconnect\n shared_data.close()\n shared_index.close()\n shared_target.close()\n "},{"location":"reference/mp_utils/#tablite.mp_utils.reindex_task","title":"tablite.mp_utils.reindex_task(src, dst, index_shm, shm_shape, start, end) ","text":"Source code in tablite/mp_utils.py def reindex_task(src, dst, index_shm, shm_shape, start, end):\n # connect\n existing_shm = shared_memory.SharedMemory(name=index_shm)\n shared_index = np.ndarray(shm_shape, dtype=np.int64, buffer=existing_shm.buf)\n # work\n array = load_numpy(src)\n new = np.take(array, shared_index[start:end])\n np.save(dst, new, allow_pickle=True, fix_imports=False)\n # disconnect\n existing_shm.close()\n "},{"location":"reference/nimlite/","title":"Nimlite","text":""},{"location":"reference/nimlite/#tablite.nimlite","title":"tablite.nimlite ","text":""},{"location":"reference/nimlite/#tablite.nimlite-attributes","title":"Attributes","text":""},{"location":"reference/nimlite/#tablite.nimlite.paths","title":"tablite.nimlite.paths = sys.argv[:] module-attribute ","text":""},{"location":"reference/nimlite/#tablite.nimlite.K","title":"tablite.nimlite.K = TypeVar('K', bound=BaseTable) module-attribute ","text":""},{"location":"reference/nimlite/#tablite.nimlite.ValidEncoders","title":"tablite.nimlite.ValidEncoders = Literal['ENC_UTF8', 'ENC_UTF16', 'ENC_WIN1250'] module-attribute 
","text":""},{"location":"reference/nimlite/#tablite.nimlite.ValidQuoting","title":"tablite.nimlite.ValidQuoting = Literal['QUOTE_MINIMAL', 'QUOTE_ALL', 'QUOTE_NONNUMERIC', 'QUOTE_NONE', 'QUOTE_STRINGS', 'QUOTE_NOTNULL'] module-attribute ","text":""},{"location":"reference/nimlite/#tablite.nimlite.ValidSkipEmpty","title":"tablite.nimlite.ValidSkipEmpty = Literal['NONE', 'ANY', 'ALL'] module-attribute ","text":""},{"location":"reference/nimlite/#tablite.nimlite.ColumnSelectorDict","title":"tablite.nimlite.ColumnSelectorDict = TypedDict('ColumnSelectorDict', {'column': str, 'type': Literal['int', 'float', 'bool', 'str', 'date', 'time', 'datetime'], 'allow_empty': Union[bool, None], 'rename': Union[str, None]}) module-attribute ","text":""},{"location":"reference/nimlite/#tablite.nimlite.FilterCriteria","title":"tablite.nimlite.FilterCriteria = Literal['>', '>=', '==', '<', '<=', '!=', 'in'] module-attribute ","text":""},{"location":"reference/nimlite/#tablite.nimlite.FilterType","title":"tablite.nimlite.FilterType = Literal['all', 'any'] module-attribute ","text":""},{"location":"reference/nimlite/#tablite.nimlite.FilterDict","title":"tablite.nimlite.FilterDict = TypedDict('FilterDict', {'column1': str, 'value1': Union[str, None], 'criteria': FilterCriteria, 'column2': str, 'value2': Union[str, None]}) module-attribute ","text":""},{"location":"reference/nimlite/#tablite.nimlite-classes","title":"Classes","text":""},{"location":"reference/nimlite/#tablite.nimlite-functions","title":"Functions","text":""},{"location":"reference/nimlite/#tablite.nimlite.get_headers","title":"tablite.nimlite.get_headers(path: Union[str, Path], encoding: ValidEncoders = 'ENC_UTF8', *, header_row_index: int = 0, newline: str = '\\n', delimiter: str = ',', text_qualifier: str = '\"', quoting: ValidQuoting, strip_leading_and_tailing_whitespace: bool = True, linecount: int = 10) -> list[list[str]] ","text":"Source code in tablite/nimlite.py def get_headers(\n path: Union[str, Path],\n encoding: ValidEncoders =\"ENC_UTF8\",\n *,\n header_row_index: int=0,\n newline: str='\\n', delimiter: str=',', text_qualifier: str='\"',\n quoting: ValidQuoting, strip_leading_and_tailing_whitespace: bool=True,\n linecount: int = 10\n) -> list[list[str]]:\n return nl.get_headers(\n path=str(path),\n encoding=encoding,\n newline=newline, delimiter=delimiter, text_qualifier=text_qualifier,\n strip_leading_and_tailing_whitespace=strip_leading_and_tailing_whitespace,\n header_row_index=header_row_index,\n quoting=quoting,\n linecount=linecount\n )\n "},{"location":"reference/nimlite/#tablite.nimlite.text_reader","title":"tablite.nimlite.text_reader(T: Type[K], pid: str, path: Union[str, Path], encoding: ValidEncoders = 'ENC_UTF8', *, first_row_has_headers: bool = True, header_row_index: int = 0, columns: List[Union[str, None]] = None, start: Union[str, None] = None, limit: Union[str, None] = None, guess_datatypes: bool = False, newline: str = '\\n', delimiter: str = ',', text_qualifier: str = '\"', quoting: ValidQuoting, strip_leading_and_tailing_whitespace: bool = True, skip_empty: ValidSkipEmpty = 'NONE', tqdm=_tqdm) -> K ","text":"Source code in tablite/nimlite.py def text_reader(\n T: Type[K],\n pid: str, path: Union[str, Path],\n encoding: ValidEncoders =\"ENC_UTF8\",\n *,\n first_row_has_headers: bool=True, header_row_index: int=0,\n columns: List[Union[str, None]]=None,\n start: Union[str, None] = None, limit: Union[str, None]=None,\n guess_datatypes: bool =False,\n newline: str='\\n', delimiter: str=',', text_qualifier: 
str='\"',\n quoting: ValidQuoting, strip_leading_and_tailing_whitespace: bool=True, skip_empty: ValidSkipEmpty = \"NONE\",\n tqdm=_tqdm\n) -> K:\n assert isinstance(path, Path)\n assert isinstance(pid, Path)\n with tqdm(total=10, desc=f\"importing file\") as pbar:\n table = nl.text_reader(\n pid=str(pid),\n path=str(path),\n encoding=encoding,\n first_row_has_headers=first_row_has_headers, header_row_index=header_row_index,\n columns=columns,\n start=start, limit=limit,\n guess_datatypes=guess_datatypes,\n newline=newline, delimiter=delimiter, text_qualifier=text_qualifier,\n quoting=quoting,\n strip_leading_and_tailing_whitespace=strip_leading_and_tailing_whitespace,\n skip_empty=skip_empty,\n page_size=Config.PAGE_SIZE\n )\n\n pbar.update(1)\n\n task_info = table[\"task\"]\n task_columns = table[\"columns\"]\n\n ti_tasks = task_info[\"tasks\"]\n ti_import_field_names = task_info[\"import_field_names\"]\n\n is_windows = platform.system() == \"Windows\"\n use_logical = False if is_windows else True\n\n cpus = max(psutil.cpu_count(logical=use_logical), 1)\n\n pbar_step = 4 / max(len(ti_tasks), 1)\n\n class WrapUpdate:\n def update(self, n):\n pbar.update(n * pbar_step)\n\n wrapped_pbar = WrapUpdate()\n\n def next_task(task: Task, page_info):\n wrapped_pbar.update(1)\n return Task(\n nl.text_reader_task,\n *task.args, **task.kwargs, page_info=page_info\n )\n\n tasks = [\n TaskChain(\n Task(\n nl.collect_text_reader_page_info_task,\n task=t,\n task_info=task_info\n ), next_task=next_task\n ) for t in ti_tasks\n ]\n\n is_sp = False\n\n if Config.MULTIPROCESSING_MODE == Config.FALSE:\n is_sp = True\n elif Config.MULTIPROCESSING_MODE == Config.FORCE:\n is_sp = False\n elif Config.MULTIPROCESSING_MODE == Config.AUTO and cpus <= 1 or len(tasks) <= 1:\n is_sp = True\n\n if is_sp:\n res = []\n\n for task in tasks:\n page = task.execute()\n\n res.append(page)\n else:\n with TaskManager(cpus, error_mode=\"exception\") as tm:\n res = tm.execute(tasks, pbar=wrapped_pbar)\n\n col_path = pid\n column_dict = {\n cols: Column(col_path)\n for cols in ti_import_field_names\n }\n\n for res_pages in res:\n col_map = {\n n: res_pages[i]\n for i, n in enumerate(ti_import_field_names)\n }\n\n for k, c in column_dict.items():\n c.pages.append(col_map[k])\n\n if columns is None:\n columns = [c[\"name\"] for c in task_columns]\n\n table_dict = {\n a[\"name\"]: column_dict[b]\n for a, b in zip(task_columns, columns)\n }\n\n pbar.update(pbar.total - pbar.n)\n\n table = T(columns=table_dict)\n\n return table\n "},{"location":"reference/nimlite/#tablite.nimlite.wrap","title":"tablite.nimlite.wrap(str_: str) -> str ","text":"Source code in tablite/nimlite.py def wrap(str_: str) -> str:\n return '\"' + str_.replace('\"', '\\\\\"').replace(\"'\", \"\\\\'\").replace(\"\\n\", \"\\\\n\").replace(\"\\t\", \"\\\\t\") + '\"'\n "},{"location":"reference/nimlite/#tablite.nimlite.column_select","title":"tablite.nimlite.column_select(table: K, cols: list[ColumnSelectorDict], tqdm=_tqdm, TaskManager=TaskManager) -> Tuple[K, K] ","text":"Source code in tablite/nimlite.py def column_select(table: K, cols: list[ColumnSelectorDict], tqdm=_tqdm, TaskManager=TaskManager) -> Tuple[K, K]:\n with tqdm(total=100, desc=\"column select\", bar_format='{desc}: {percentage:.1f}%|{bar}{r_bar}') as pbar:\n T = type(table)\n dir_pid = Config.workdir / Config.pid\n\n col_infos = nl.collect_column_select_info(table, cols, str(dir_pid), pbar)\n\n columns = col_infos[\"columns\"]\n page_count = col_infos[\"page_count\"]\n is_correct_type = 
col_infos[\"is_correct_type\"]\n desired_column_map = col_infos[\"desired_column_map\"]\n original_pages_map = col_infos[\"original_pages_map\"]\n passed_column_data = col_infos[\"passed_column_data\"]\n failed_column_data = col_infos[\"failed_column_data\"]\n res_cols_pass = col_infos[\"res_cols_pass\"]\n res_cols_fail = col_infos[\"res_cols_fail\"]\n column_names = col_infos[\"column_names\"]\n reject_reason_name = col_infos[\"reject_reason_name\"]\n\n if all(is_correct_type.values()):\n tbl_pass_columns = {\n desired_name: table[desired_info[0]]\n for desired_name, desired_info in desired_column_map.items()\n }\n\n tbl_fail_columns = {\n desired_name: []\n for desired_name in failed_column_data\n }\n\n tbl_pass = T(columns=tbl_pass_columns)\n tbl_fail = T(columns=tbl_fail_columns)\n\n return (tbl_pass, tbl_fail)\n\n task_list_inp = (\n _collect_cs_info(i, columns, res_cols_pass, res_cols_fail, original_pages_map)\n for i in range(page_count)\n )\n\n page_size = Config.PAGE_SIZE\n\n tasks = (\n Task(\n nl.do_slice_convert, str(dir_pid), page_size, columns, reject_reason_name, res_pass, res_fail, desired_column_map, column_names, is_correct_type\n )\n for columns, res_pass, res_fail in task_list_inp\n )\n\n cpu_count = max(psutil.cpu_count(), 1)\n\n if Config.MULTIPROCESSING_MODE == Config.FORCE:\n is_mp = True\n elif Config.MULTIPROCESSING_MODE == Config.FALSE:\n is_mp = False\n elif Config.MULTIPROCESSING_MODE == Config.AUTO:\n is_multithreaded = cpu_count > 1\n is_multipage = page_count > 1\n\n is_mp = is_multithreaded and is_multipage\n\n tbl_pass = T({k: [] for k in passed_column_data})\n tbl_fail = T({k: [] for k in failed_column_data})\n\n converted = []\n step_size = 45 / max(page_count, 1)\n\n if is_mp:\n class WrapUpdate:\n def update(self, n):\n pbar.update(n * step_size)\n\n with TaskManager(min(cpu_count, page_count), error_mode=\"exception\") as tm:\n res = tm.execute(list(tasks), pbar=WrapUpdate())\n\n converted.extend(res)\n else:\n for task in tasks:\n res = task.f(*task.args, **task.kwargs)\n\n converted.append(res)\n pbar.update(step_size)\n\n def extend_table(table, columns):\n for (col_name, pg) in columns:\n table[col_name].pages.append(pg)\n\n for pg_pass, pg_fail in converted:\n extend_table(tbl_pass, pg_pass)\n extend_table(tbl_fail, pg_fail)\n\n pbar.update(pbar.total - pbar.n)\n\n return tbl_pass, tbl_fail\n "},{"location":"reference/nimlite/#tablite.nimlite.read_page","title":"tablite.nimlite.read_page(path: Union[str, Path]) -> np.ndarray ","text":"Source code in tablite/nimlite.py def read_page(path: Union[str, Path]) -> np.ndarray:\n return nl.read_page(str(path))\n "},{"location":"reference/nimlite/#tablite.nimlite.repaginate","title":"tablite.nimlite.repaginate(column: Column) ","text":"Source code in tablite/nimlite.py def repaginate(column: Column):\n nl.repaginate(column)\n "},{"location":"reference/nimlite/#tablite.nimlite.nearest_neighbour","title":"tablite.nimlite.nearest_neighbour(T: BaseTable, sources: Union[list[str], None], missing: Union[list, None], targets: Union[list[str], None], tqdm=_tqdm) ","text":"Source code in tablite/nimlite.py def nearest_neighbour(T: BaseTable, sources: Union[list[str], None], missing: Union[list, None], targets: Union[list[str], None], tqdm=_tqdm):\n return nl.nearest_neighbour(T, sources, list(missing), targets, tqdm)\n "},{"location":"reference/nimlite/#tablite.nimlite.groupby","title":"tablite.nimlite.groupby(T, keys, functions, tqdm=_tqdm) ","text":"Source code in tablite/nimlite.py def groupby(T, keys, 
functions, tqdm=_tqdm):\n return nl.groupby(T, keys, functions, tqdm)\n "},{"location":"reference/nimlite/#tablite.nimlite.filter","title":"tablite.nimlite.filter(table: BaseTable, expressions: list[FilterDict], type: FilterType, tqdm=_tqdm) ","text":"Source code in tablite/nimlite.py def filter(table: BaseTable, expressions: list[FilterDict], type: FilterType, tqdm = _tqdm):\n return nl.filter(table, expressions, type, tqdm)\n "},{"location":"reference/pivots/","title":"Pivots","text":""},{"location":"reference/pivots/#tablite.pivots","title":"tablite.pivots ","text":""},{"location":"reference/pivots/#tablite.pivots-classes","title":"Classes","text":""},{"location":"reference/pivots/#tablite.pivots-functions","title":"Functions","text":""},{"location":"reference/pivots/#tablite.pivots.pivot","title":"tablite.pivots.pivot(T, rows, columns, functions, values_as_rows=True, tqdm=_tqdm, pbar=None) ","text":"param: rows: column names to keep as rows param: columns: column names to keep as columns param: functions: aggregation functions from the Groupby class as example: >>> t.show()\n+=====+=====+=====+\n| A | B | C |\n| int | int | int |\n+-----+-----+-----+\n| 1| 1| 6|\n| 1| 2| 5|\n| 2| 3| 4|\n| 2| 4| 3|\n| 3| 5| 2|\n| 3| 6| 1|\n| 1| 1| 6|\n| 1| 2| 5|\n| 2| 3| 4|\n| 2| 4| 3|\n| 3| 5| 2|\n| 3| 6| 1|\n+=====+=====+=====+\n\n>>> t2 = t.pivot(rows=['C'], columns=['A'], functions=[('B', gb.sum)])\n>>> t2.show()\n+===+===+========+=====+=====+=====+\n| # | C |function|(A=1)|(A=2)|(A=3)|\n|row|int| str |mixed|mixed|mixed|\n+---+---+--------+-----+-----+-----+\n|0 | 6|Sum(B) | 2|None |None |\n|1 | 5|Sum(B) | 4|None |None |\n|2 | 4|Sum(B) |None | 6|None |\n|3 | 3|Sum(B) |None | 8|None |\n|4 | 2|Sum(B) |None |None | 10|\n|5 | 1|Sum(B) |None |None | 12|\n+===+===+========+=====+=====+=====+\n Source code in tablite/pivots.py def pivot(T, rows, columns, functions, values_as_rows=True, tqdm=_tqdm, pbar=None):\n \"\"\"\n param: rows: column names to keep as rows\n param: columns: column names to keep as columns\n param: functions: aggregation functions from the Groupby class as\n\n example:\n ```\n >>> t.show()\n +=====+=====+=====+\n | A | B | C |\n | int | int | int |\n +-----+-----+-----+\n | 1| 1| 6|\n | 1| 2| 5|\n | 2| 3| 4|\n | 2| 4| 3|\n | 3| 5| 2|\n | 3| 6| 1|\n | 1| 1| 6|\n | 1| 2| 5|\n | 2| 3| 4|\n | 2| 4| 3|\n | 3| 5| 2|\n | 3| 6| 1|\n +=====+=====+=====+\n\n >>> t2 = t.pivot(rows=['C'], columns=['A'], functions=[('B', gb.sum)])\n >>> t2.show()\n +===+===+========+=====+=====+=====+\n | # | C |function|(A=1)|(A=2)|(A=3)|\n |row|int| str |mixed|mixed|mixed|\n +---+---+--------+-----+-----+-----+\n |0 | 6|Sum(B) | 2|None |None |\n |1 | 5|Sum(B) | 4|None |None |\n |2 | 4|Sum(B) |None | 6|None |\n |3 | 3|Sum(B) |None | 8|None |\n |4 | 2|Sum(B) |None |None | 10|\n |5 | 1|Sum(B) |None |None | 12|\n +===+===+========+=====+=====+=====+\n ```\n\n \"\"\"\n sub_cls_check(T, BaseTable)\n\n if isinstance(rows, str):\n rows = [rows]\n if not all(isinstance(i, str) for i in rows):\n raise TypeError(f\"Expected rows as a list of column names, not {[i for i in rows if not isinstance(i,str)]}\")\n\n if isinstance(columns, str):\n columns = [columns]\n if not all(isinstance(i, str) for i in columns):\n raise TypeError(\n f\"Expected columns as a list of column names, not {[i for i in columns if not isinstance(i, str)]}\"\n )\n\n if not isinstance(values_as_rows, bool):\n raise TypeError(f\"expected sum_on_rows as boolean, not {type(values_as_rows)}\")\n\n keys = rows + columns\n assert isinstance(keys, list)\n\n 
extra_steps = 2\n\n if pbar is None:\n total = extra_steps\n\n if len(functions) == 0:\n total = total + len(keys)\n else:\n total = total + len(T)\n\n pbar = tqdm(total=total, desc=\"pivot\")\n\n grpby = groupby(T, keys, functions, tqdm=tqdm)\n Constr = type(T)\n\n if len(grpby) == 0: # return empty table. This must be a test?\n pbar.update(extra_steps)\n return Constr()\n\n # split keys to determine grid dimensions\n row_key_index = {}\n col_key_index = {}\n\n r = len(rows)\n c = len(columns)\n g = len(functions)\n\n records = defaultdict(dict)\n\n for row in grpby.rows:\n row_key = tuple(row[:r])\n col_key = tuple(row[r : r + c])\n func_key = tuple(row[r + c :])\n\n if row_key not in row_key_index:\n row_key_index[row_key] = len(row_key_index) # Y\n\n if col_key not in col_key_index:\n col_key_index[col_key] = len(col_key_index) # X\n\n rix = row_key_index[row_key]\n cix = col_key_index[col_key]\n if cix in records:\n if rix in records[cix]:\n raise ValueError(\"this should be empty.\")\n records[cix][rix] = func_key\n\n pbar.update(1)\n result = type(T)()\n\n if values_as_rows: # ---> leads to more rows.\n # first create all columns left to right\n\n n = r + 1 # rows keys + 1 col for function values.\n cols = [[] for _ in range(n)]\n for row, ix in row_key_index.items():\n for col_name, f in functions:\n cols[-1].append(f\"{f}({col_name})\")\n for col_ix, v in enumerate(row):\n cols[col_ix].append(v)\n\n for col_name, values in zip(rows + [\"function\"], cols):\n col_name = unique_name(col_name, result.columns)\n result[col_name] = values\n col_length = len(cols[0])\n cols.clear()\n\n # then populate the sparse matrix.\n for col_key, c in col_key_index.items():\n col_name = \"(\" + \",\".join([f\"{col_name}={value}\" for col_name, value in zip(columns, col_key)]) + \")\"\n col_name = unique_name(col_name, result.columns)\n L = [None for _ in range(col_length)]\n for r, funcs in records[c].items():\n for ix, f in enumerate(funcs):\n L[g * r + ix] = f\n result[col_name] = L\n\n else: # ---> leads to more columns.\n n = r\n cols = [[] for _ in range(n)]\n for row in row_key_index:\n for col_ix, v in enumerate(row):\n cols[col_ix].append(v) # write key columns.\n\n for col_name, values in zip(rows, cols):\n result[col_name] = values\n\n col_length = len(row_key_index)\n\n # now populate the sparse matrix.\n for col_key, c in col_key_index.items(): # select column.\n cols, names = [], []\n\n for f, v in zip(functions, func_key):\n agg_col, func = f\n terms = \",\".join([agg_col] + [f\"{col_name}={value}\" for col_name, value in zip(columns, col_key)])\n col_name = f\"{func}({terms})\"\n col_name = unique_name(col_name, result.columns)\n names.append(col_name)\n cols.append([None for _ in range(col_length)])\n for r, funcs in records[c].items():\n for ix, f in enumerate(funcs):\n cols[ix][r] = f\n for name, col in zip(names, cols):\n result[name] = col\n\n pbar.update(1)\n\n return result\n "},{"location":"reference/pivots/#tablite.pivots.transpose","title":"tablite.pivots.transpose(T, tqdm=_tqdm) ","text":"performs a CCW matrix rotation of the table. 
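A small sketch of the rotation with illustrative values (a dict of columns is a valid Table constructor argument, as used elsewhere in this reference):

from tablite import Table
from tablite.pivots import transpose

t = Table({"name": ["a", "b"], "x": [1, 3], "y": [2, 4]})
t2 = transpose(t)
# the first column supplies the new headers:
# t2 holds {"name": ["x", "y"], "a": [1, 2], "b": [3, 4]}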
Source code in tablite/pivots.py def transpose(T, tqdm=_tqdm):\n \"\"\"performs a CCW matrix rotation of the table.\"\"\"\n sub_cls_check(T, BaseTable)\n\n if len(T.columns) == 0:\n return type(T)()\n\n assert isinstance(T, BaseTable)\n new = type(T)()\n L = list(T.columns)\n new[L[0]] = L[1:]\n for row in tqdm(T.rows, desc=\"table transpose\", total=len(T)):\n new[row[0]] = row[1:]\n return new\n "},{"location":"reference/pivots/#tablite.pivots.pivot_transpose","title":"tablite.pivots.pivot_transpose(T, columns, keep=None, column_name='transpose', value_name='value', tqdm=_tqdm) ","text":"Transpose a selection of columns to rows. PARAMETER DESCRIPTION columns column names to transpose TYPE: list of column names keep column names to keep (repeat) TYPE: list of column names DEFAULT: None RETURNS DESCRIPTION Table with columns transposed to rows Example transpose columns 1,2 and 3 and transpose the remaining columns, except sum . Input: | col1 | col2 | col3 | sun | mon | tue | ... | sat | sum |\n|------|------|------|-----|-----|-----|-----|-----|------|\n| 1234 | 2345 | 3456 | 456 | 567 | | ... | | 1023 |\n| 1244 | 2445 | 4456 | | 7 | | ... | | 7 |\n| ... | | | | | | | | |\n\n>>> t.transpose(keep=[col1, col2, col3], transpose=[sun,mon,tue,wed,thu,fri,sat])`\n\nOutput:\n|col1| col2| col3| transpose| value|\n|----|-----|-----|----------|------|\n|1234| 2345| 3456| sun | 456|\n|1234| 2345| 3456| mon | 567|\n|1244| 2445| 4456| mon | 7|\n Source code in tablite/pivots.py def pivot_transpose(T, columns, keep=None, column_name=\"transpose\", value_name=\"value\", tqdm=_tqdm):\n \"\"\"Transpose a selection of columns to rows.\n\n Args:\n columns (list of column names): column names to transpose\n keep (list of column names): column names to keep (repeat)\n\n Returns:\n Table: with columns transposed to rows\n\n Example:\n transpose columns 1,2 and 3 and transpose the remaining columns, except `sum`.\n\n Input:\n ```\n | col1 | col2 | col3 | sun | mon | tue | ... | sat | sum |\n |------|------|------|-----|-----|-----|-----|-----|------|\n | 1234 | 2345 | 3456 | 456 | 567 | | ... | | 1023 |\n | 1244 | 2445 | 4456 | | 7 | | ... | | 7 |\n | ... 
| | | | | | | | |\n\n >>> t.transpose(keep=[col1, col2, col3], transpose=[sun,mon,tue,wed,thu,fri,sat])`\n\n Output:\n |col1| col2| col3| transpose| value|\n |----|-----|-----|----------|------|\n |1234| 2345| 3456| sun | 456|\n |1234| 2345| 3456| mon | 567|\n |1244| 2445| 4456| mon | 7|\n ```\n\n \"\"\"\n sub_cls_check(T, BaseTable)\n\n if not isinstance(columns, list):\n raise TypeError\n\n for i in columns:\n if not isinstance(i, str):\n raise TypeError\n if i not in T.columns:\n raise ValueError\n if columns.count(i)>1:\n raise ValueError(f\"Column {i} appears more than once\")\n\n if keep is None:\n keep = []\n for i in keep:\n if not isinstance(i, str):\n raise TypeError\n if i not in T.columns:\n raise ValueError\n\n if column_name in keep + columns:\n column_name = unique_name(column_name, set_of_names=keep + columns)\n if value_name in keep + columns + [column_name]:\n value_name = unique_name(value_name, set_of_names=keep + columns)\n\n new = type(T)()\n new.add_columns(*keep + [column_name, value_name])\n news = {name: [] for name in new.columns}\n\n n = len(keep)\n\n with tqdm(total=len(T), desc=\"transpose\", disable=Config.TQDM_DISABLE) as pbar:\n it = T[keep + columns].rows if len(keep + columns) > 1 else ((v, ) for v in T[keep + columns])\n\n for ix, row in enumerate(it, start=1):\n keeps = row[:n]\n transposes = row[n:]\n\n for name, value in zip(keep, keeps):\n news[name].extend([value] * len(transposes))\n for name, value in zip(columns, transposes):\n news[column_name].append(name)\n news[value_name].append(value)\n\n if ix % Config.SINGLE_PROCESSING_LIMIT == 0:\n for name, values in news.items():\n new[name].extend(values)\n values.clear()\n\n pbar.update(1)\n\n for name, values in news.items():\n new[name].extend(np.array(values))\n values.clear()\n return new\n "},{"location":"reference/redux/","title":"Redux","text":""},{"location":"reference/redux/#tablite.redux","title":"tablite.redux ","text":""},{"location":"reference/redux/#tablite.redux-attributes","title":"Attributes","text":""},{"location":"reference/redux/#tablite.redux-classes","title":"Classes","text":""},{"location":"reference/redux/#tablite.redux-functions","title":"Functions","text":""},{"location":"reference/redux/#tablite.redux.filter_all","title":"tablite.redux.filter_all(T, **kwargs) ","text":"returns Table for rows where ALL kwargs match :param kwargs: dictionary with headers and values / boolean callable Examples: t = Table()\nt['a'] = [1,2,3,4]\nt['b'] = [10,20,30,40]\n\ndef f(x):\n return x == 4\ndef g(x):\n return x < 20\n\nt2 = t.any( **{\"a\":f, \"b\":g})\nassert [r for r in t2.rows] == [[1, 10], [4, 40]]\n\nt2 = t.any(a=f,b=g)\nassert [r for r in t2.rows] == [[1, 10], [4, 40]]\n\ndef h(x):\n return x>=2\n\ndef i(x):\n return x<=30\n\nt2 = t.all(a=h,b=i)\nassert [r for r in t2.rows] == [[2,20], [3, 30]]\n Source code in tablite/redux.py def filter_all(T, **kwargs):\n \"\"\"\n returns Table for rows where ALL kwargs match\n :param kwargs: dictionary with headers and values / boolean callable\n\n Examples:\n\n t = Table()\n t['a'] = [1,2,3,4]\n t['b'] = [10,20,30,40]\n\n def f(x):\n return x == 4\n def g(x):\n return x < 20\n\n t2 = t.any( **{\"a\":f, \"b\":g})\n assert [r for r in t2.rows] == [[1, 10], [4, 40]]\n\n t2 = t.any(a=f,b=g)\n assert [r for r in t2.rows] == [[1, 10], [4, 40]]\n\n def h(x):\n return x>=2\n\n def i(x):\n return x<=30\n\n t2 = t.all(a=h,b=i)\n assert [r for r in t2.rows] == [[2,20], [3, 30]]\n\n\n \"\"\"\n sub_cls_check(T, BaseTable)\n\n if not isinstance(kwargs, 
dict):\n raise TypeError(\"did you forget to add the ** in front of your dict?\")\n if not all([k in T.columns for k in kwargs]):\n raise ValueError(f\"Unknown column(s): {[k for k in kwargs if k not in T.columns]}\")\n\n mask = np.full((len(T),), True)\n for k, v in kwargs.items():\n col = T[k]\n for start, end, page in col.iter_by_page():\n data = page.get()\n if callable(v):\n vf = np.frompyfunc(v, 1, 1)\n mask[start:end] = mask[start:end] & np.apply_along_axis(vf, 0, data)\n else:\n mask[start:end] = mask[start:end] & (data == v)\n\n return _compress_one(T, mask)\n "},{"location":"reference/redux/#tablite.redux.drop","title":"tablite.redux.drop(T, *args) ","text":"drops all rows that contain args PARAMETER DESCRIPTION T TYPE: Table Source code in tablite/redux.py def drop(T, *args):\n \"\"\"drops all rows that contain args\n\n Args:\n T (Table):\n \"\"\"\n sub_cls_check(T, BaseTable)\n mask = np.full((len(T),), False)\n for name in T.columns:\n col = T[name]\n for start, end, page in col.iter_by_page():\n data = page.get()\n for arg in args:\n mask[start:end] = mask[start:end] | (data == arg)\n\n mask = np.invert(mask)\n return _compress_one(T, mask)\n "},{"location":"reference/redux/#tablite.redux.filter_any","title":"tablite.redux.filter_any(T, **kwargs) ","text":"returns Table for rows where ANY kwargs match :param kwargs: dictionary with headers and values / boolean callable Source code in tablite/redux.py def filter_any(T, **kwargs):\n \"\"\"\n returns Table for rows where ANY kwargs match\n :param kwargs: dictionary with headers and values / boolean callable\n \"\"\"\n sub_cls_check(T, BaseTable)\n if not isinstance(kwargs, dict):\n raise TypeError(\"did you forget to add the ** in front of your dict?\")\n\n mask = np.full((len(T),), False)\n for k, v in kwargs.items():\n col = T[k]\n for start, end, page in col.iter_by_page():\n data = page.get()\n if callable(v):\n vf = np.frompyfunc(v, 1, 1)\n mask[start:end] = mask[start:end] | np.apply_along_axis(vf, 0, data)\n else:\n mask[start:end] = mask[start:end] | (v == data)\n\n return _compress_one(T, mask)\n "},{"location":"reference/redux/#tablite.redux.compress_both","title":"tablite.redux.compress_both(T, mask, pbar: _tqdm) ","text":"Source code in tablite/redux.py def compress_both(T, mask, pbar: _tqdm):\n # NOTE FOR DEVELOPERS:\n # np.compress is so fast that the overhead of multiprocessing doesn't pay off.\n cls = type(T)\n true, false = cls(), cls()\n\n pbar_div = (len(T.columns) * len(list(Config.page_steps(len(T)))) - 1)\n pbar_step = (10 / pbar_div) if pbar_div != 0 else 0\n\n for name in T.columns:\n true.add_column(name)\n false.add_column(name)\n true_col = true[name] # fetch the col to avoid doing it in the loop below\n false_col = false[name]\n # prevent OOMError by slicing the getitem ops\n for start, end in Config.page_steps(len(T)):\n data = T[name][start:end]\n true_col.extend(np.compress(mask[start:end], data))\n false_col.extend(np.compress(np.invert(mask)[start:end], data))\n if pbar is not None:\n pbar.update(pbar_step)\n return true, false\n "},{"location":"reference/redux/#tablite.redux.get_filter_bitmap","title":"tablite.redux.get_filter_bitmap(T, expressions, pbar: _tqdm) ","text":"Source code in tablite/redux.py def get_filter_bitmap(T, expressions, pbar: _tqdm):\n for expression in expressions:\n if not isinstance(expression, dict):\n raise TypeError(f\"invalid expression: {expression}\")\n if not len(expression) == 3:\n raise ValueError(f\"expected 3 items, got {expression}\")\n x = {\"column1\", 
\"column2\", \"criteria\", \"value1\", \"value2\"}\n if not set(expression.keys()).issubset(x):\n raise ValueError(f\"got unknown key: {set(expression.keys()).difference(x)}\")\n\n if expression[\"criteria\"] not in filter_ops:\n raise ValueError(f\"criteria missing from {expression}\")\n\n c1 = expression.get(\"column1\", None)\n if c1 is not None and c1 not in T.columns:\n raise ValueError(f\"no such column: {c1}\")\n\n v1 = expression.get(\"value1\", None)\n if v1 is not None and c1 is not None:\n raise ValueError(\"filter can only take 1 left expr element. Got 2.\")\n\n c2 = expression.get(\"column2\", None)\n if c2 is not None and c2 not in T.columns:\n raise ValueError(f\"no such column: {c2}\")\n\n v2 = expression.get(\"value2\", None)\n if v2 is not None and c2 is not None:\n raise ValueError(\"filter can only take 1 right expression element. Got 2.\")\n\n # EVALUATION....\n # 1. setup a rectangular bitmap for evaluations\n bitmap = np.empty(shape=(len(expressions), len(T)), dtype=bool)\n pbar_div = (len(expressions) * len(list(Config.page_steps(len(T)))) - 1)\n pbar_step = (10 / pbar_div) if pbar_div != 0 else 0\n # 2. create tasks for evaluations\n for bit_index, expression in enumerate(expressions):\n assert isinstance(expression, dict)\n assert len(expression) == 3\n c1 = expression.get(\"column1\", None)\n c2 = expression.get(\"column2\", None)\n expr = expression.get(\"criteria\", None)\n assert expr in filter_ops\n v1 = expression.get(\"value1\", None)\n v2 = expression.get(\"value2\", None)\n\n for start, end in Config.page_steps(len(T)):\n if c1 is not None:\n dset_A = T[c1][start:end]\n else: # v1 is active:\n dset_A = np.array([v1] * (end - start))\n\n if c2 is not None:\n dset_B = T[c2][start:end]\n else: # v2 is active:\n dset_B = np.array([v2] * (end - start))\n\n if len(dset_A) != len(dset_B):\n raise ValueError(\n f\"Assymmetric dataset: {c1} has {len(dset_A)} values, whilst {c2} has {len(dset_B)} values.\"\n )\n # Evaluate\n try:\n if expr == \">\":\n result = dset_A > dset_B\n elif expr == \">=\":\n result = dset_A >= dset_B\n elif expr == \"==\":\n result = dset_A == dset_B\n elif expr == \"<\":\n result = dset_A < dset_B\n elif expr == \"<=\":\n result = dset_A <= dset_B\n elif expr == \"!=\":\n result = dset_A != dset_B\n else: # it's a python evaluations (slow)\n f = filter_ops.get(expr)\n assert callable(f)\n result = list_to_np_array([f(a, b) for a, b in zip(dset_A, dset_B)])\n except TypeError:\n def safe_test(f, a, b):\n try:\n return f(a, b)\n except TypeError:\n return False\n f = filter_ops.get(expr)\n assert callable(f)\n result = list_to_np_array([safe_test(f, a, b) for a, b in zip(dset_A, dset_B)])\n bitmap[bit_index, start:end] = result\n if pbar is not None:\n pbar.update(pbar_step)\n\n return bitmap\n "},{"location":"reference/redux/#tablite.redux.filter_non_primitive","title":"tablite.redux.filter_non_primitive(T, expressions, filter_type='all', tqdm=_tqdm) ","text":"OBSOLETE filters table PARAMETER DESCRIPTION T Table. 
TYPE: Table subclass expressions str: filters based on an expression, such as: \"all((A==B, C!=4, 200<D))\" which is interpreted using python's compiler to: def _f(A,B,C,D):\n return all((A==B, C!=4, 200<D))\n list of dicts: (example): L = [ {'column1':'A', 'criteria': \"==\", 'column2': 'B'}, {'column1':'C', 'criteria': \"!=\", \"value2\": '4'}, {'value1': 200, 'criteria': \"<\", column2: 'D' } ] TYPE: list or str accepted 'column1', 'column2', 'criteria', 'value1', 'value2' TYPE: dictionary keys filter_type Ignored if expressions is str. 'all' or 'any'. Defaults to \"all\". TYPE: str DEFAULT: 'all' tqdm progressbar. Defaults to _tqdm. TYPE: tqdm DEFAULT: tqdm RETURNS DESCRIPTION 2xTables trues, falses Source code in tablite/redux.py def filter_non_primitive(T, expressions, filter_type=\"all\", tqdm=_tqdm):\n \"\"\"\n OBSOLETE\n filters table\n\n\n Args:\n T (Table subclass): Table.\n expressions (list or str):\n str:\n filters based on an expression, such as:\n \"all((A==B, C!=4, 200<D))\"\n which is interpreted using python's compiler to:\n\n def _f(A,B,C,D):\n return all((A==B, C!=4, 200<D))\n\n list of dicts: (example):\n\n L = [\n {'column1':'A', 'criteria': \"==\", 'column2': 'B'},\n {'column1':'C', 'criteria': \"!=\", \"value2\": '4'},\n {'value1': 200, 'criteria': \"<\", column2: 'D' }\n ]\n\n accepted dictionary keys: 'column1', 'column2', 'criteria', 'value1', 'value2'\n\n filter_type (str, optional): Ignored if expressions is str.\n 'all' or 'any'. Defaults to \"all\".\n tqdm (tqdm, optional): progressbar. Defaults to _tqdm.\n\n Returns:\n 2xTables: trues, falses\n \"\"\"\n # determine method\n warnings.warn(\"Filter using non-primitive types is not recommended.\")\n sub_cls_check(T, BaseTable)\n if len(T) == 0:\n return T.copy(), T.copy()\n\n with tqdm(desc=\"filter\", total=20) as pbar:\n if isinstance(expressions, str):\n mask = _filter_using_expression(T, expressions)\n pbar.update(10)\n elif isinstance(expressions, list):\n mask = _filter_using_list_of_dicts(T, expressions, filter_type, pbar)\n else:\n raise TypeError\n # create new tables\n res = compress_both(T, mask, pbar=pbar)\n pbar.update(pbar.total - pbar.n)\n\n return res\n "},{"location":"reference/redux/#tablite.redux.filter","title":"tablite.redux.filter(T, expressions, filter_type='all', tqdm=_tqdm) ","text":"filters table Note: At the moment only tablite primitive types are supported PARAMETER DESCRIPTION T Table. TYPE: Table subclass expressions str: filters based on an expression, such as: \"all((A==B, C!=4, 200<D))\" which is interpreted using python's compiler to: def _f(A,B,C,D):\n return all((A==B, C!=4, 200<D))\n list of dicts: (example): L = [ {'column1':'A', 'criteria': \"==\", 'column2': 'B'}, {'column1':'C', 'criteria': \"!=\", \"value2\": '4'}, {'value1': 200, 'criteria': \"<\", column2: 'D' } ] TYPE: list or str accepted 'column1', 'column2', 'criteria', 'value1', 'value2' TYPE: dictionary keys filter_type Ignored if expressions is str. 'all' or 'any'. Defaults to \"all\". TYPE: str DEFAULT: 'all' tqdm progressbar. Defaults to _tqdm. 
TYPE: tqdm DEFAULT: tqdm RETURNS DESCRIPTION 2xTables trues, falses Source code in tablite/redux.py def filter(T, expressions, filter_type=\"all\", tqdm=_tqdm):\n \"\"\"filters table\n Note: At the moment only tablite primitive types are supported\n\n Args:\n T (Table subclass): Table.\n expressions (list or str):\n str:\n filters based on an expression, such as:\n \"all((A==B, C!=4, 200<D))\"\n which is interpreted using python's compiler to:\n\n def _f(A,B,C,D):\n return all((A==B, C!=4, 200<D))\n\n list of dicts: (example):\n\n L = [\n {'column1':'A', 'criteria': \"==\", 'column2': 'B'},\n {'column1':'C', 'criteria': \"!=\", \"value2\": '4'},\n {'value1': 200, 'criteria': \"<\", column2: 'D' }\n ]\n\n accepted dictionary keys: 'column1', 'column2', 'criteria', 'value1', 'value2'\n\n filter_type (str, optional): Ignored if expressions is str.\n 'all' or 'any'. Defaults to \"all\".\n tqdm (tqdm, optional): progressbar. Defaults to _tqdm.\n\n Returns:\n 2xTables: trues, falses\n \"\"\"\n # determine method\n sub_cls_check(T, BaseTable)\n if len(T) == 0:\n return T.copy(), T.copy()\n\n if isinstance(expressions, str):\n with tqdm(desc=\"filter\", total=20) as pbar:\n # TODO: make parser for expressions and use the nim implement\n mask = _filter_using_expression(T, expressions)\n pbar.update(10)\n res = compress_both(T, mask, pbar=pbar)\n pbar.update(pbar.total - pbar.n)\n elif isinstance(expressions, list):\n return _filter_using_list_of_dicts_native(T, expressions, filter_type, tqdm)\n else:\n raise TypeError\n # create new tables\n\n return res\n "},{"location":"reference/reindex/","title":"Reindex","text":""},{"location":"reference/reindex/#tablite.reindex","title":"tablite.reindex ","text":""},{"location":"reference/reindex/#tablite.reindex-classes","title":"Classes","text":""},{"location":"reference/reindex/#tablite.reindex-functions","title":"Functions","text":""},{"location":"reference/reindex/#tablite.reindex.reindex","title":"tablite.reindex.reindex(T, index, names=None, tqdm=_tqdm, pbar=None) ","text":"Constant Memory helper for reindexing pages. Memory usage is set by datatype and Config.PAGE_SIZE PARAMETER DESCRIPTION T subclass of Table TYPE: Table index int64. TYPE: array names list of names from T to reindex. TYPE: (list, str) DEFAULT: None tqdm Defaults to _tqdm. TYPE: tqdm DEFAULT: tqdm pbar Defaults to None. 
TYPE: pbar DEFAULT: None RETURNS DESCRIPTION Table the reindexed table Source code in tablite/reindex.py def reindex(T, index, names=None, tqdm=_tqdm, pbar=None):\n    \"\"\"Constant Memory helper for reindexing pages.\n\n    Memory usage is set by datatype and Config.PAGE_SIZE\n\n    Args:\n        T (Table): subclass of Table\n        index (np.array): int64.\n        names (list, str): list of names from T to reindex.\n        tqdm (tqdm, optional): Defaults to _tqdm.\n        pbar (pbar, optional): Defaults to None.\n\n    Returns:\n        Table: the reindexed table.\n    \"\"\"\n    if names is None:\n        names = list(T.columns.keys())\n\n    if pbar is None:\n        total = len(names)\n        pbar = tqdm(total=total, desc=\"join\", disable=Config.TQDM_DISABLE)\n\n    sub_cls_check(T, BaseTable)\n    cls = type(T)\n    result = cls()\n    for name in names:\n        result.add_column(name)\n        col = result[name]\n\n        for start, end in Config.page_steps(len(index)):\n            indices = index[start:end]\n            values = T[name].get_by_indices(indices)\n            # in these values, the index of -1 will be wrong.\n            # so if there is any -1 in the indices, they will\n            # have to be replaced with Nones\n            mask = indices == -1\n            if np.any(mask):\n                nones = np.full(indices.shape, fill_value=None)\n                values = np.where(mask, nones, values)\n            col.extend(values)\n        pbar.update(1)\n\n    return result\n "},{"location":"reference/sort_utils/","title":"Sort utils","text":""},{"location":"reference/sort_utils/#tablite.sort_utils","title":"tablite.sort_utils ","text":""},{"location":"reference/sort_utils/#tablite.sort_utils-attributes","title":"Attributes","text":""},{"location":"reference/sort_utils/#tablite.sort_utils.uca_collator","title":"tablite.sort_utils.uca_collator = Collator() module-attribute ","text":""},{"location":"reference/sort_utils/#tablite.sort_utils.modes","title":"tablite.sort_utils.modes = {'alphanumeric': text_sort, 'unix': unix_sort, 'excel': excel_sort} module-attribute ","text":""},{"location":"reference/sort_utils/#tablite.sort_utils-classes","title":"Classes","text":""},{"location":"reference/sort_utils/#tablite.sort_utils.HashDict","title":"tablite.sort_utils.HashDict ","text":" Bases: dict This class is just a nicety: syntactic sugar for debugging. It functions identically to a regular dictionary; it just uses a tupled key. 
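An illustrative sketch of that claim; with plain keys HashDict behaves exactly like dict:

from tablite.sort_utils import HashDict

d = HashDict()
d["a"] = 1
d[2.5] = "b"
assert "a" in d and d[2.5] == "b"
assert d.keys() == ["a", 2.5]   # keys() unpacks the internal tupled keys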
"},{"location":"reference/sort_utils/#tablite.sort_utils.HashDict-functions","title":"Functions","text":""},{"location":"reference/sort_utils/#tablite.sort_utils.HashDict.items","title":"tablite.sort_utils.HashDict.items() ","text":"Source code in tablite/sort_utils.py def items(self):\n return [(k, v) for (_, k), v in super().items()]\n "},{"location":"reference/sort_utils/#tablite.sort_utils.HashDict.keys","title":"tablite.sort_utils.HashDict.keys() ","text":"Source code in tablite/sort_utils.py def keys(self):\n return [k for (_, k) in super().keys()]\n "},{"location":"reference/sort_utils/#tablite.sort_utils.HashDict.__iter__","title":"tablite.sort_utils.HashDict.__iter__() -> Iterator ","text":"Source code in tablite/sort_utils.py def __iter__(self) -> Iterator:\n return (k for (_, k) in super().keys())\n "},{"location":"reference/sort_utils/#tablite.sort_utils.HashDict.__getitem__","title":"tablite.sort_utils.HashDict.__getitem__(key) ","text":"Source code in tablite/sort_utils.py def __getitem__(self, key):\n return super().__getitem__(self._get_hash(key))\n "},{"location":"reference/sort_utils/#tablite.sort_utils.HashDict.__setitem__","title":"tablite.sort_utils.HashDict.__setitem__(key, value) ","text":"Source code in tablite/sort_utils.py def __setitem__(self, key, value):\n return super().__setitem__(self._get_hash(key), value)\n "},{"location":"reference/sort_utils/#tablite.sort_utils.HashDict.__contains__","title":"tablite.sort_utils.HashDict.__contains__(key) -> bool ","text":"Source code in tablite/sort_utils.py def __contains__(self, key) -> bool:\n return super().__contains__(self._get_hash(key))\n "},{"location":"reference/sort_utils/#tablite.sort_utils.HashDict.__delitem__","title":"tablite.sort_utils.HashDict.__delitem__(key) ","text":"Source code in tablite/sort_utils.py def __delitem__(self, key):\n return super().__delitem__(self._get_hash(key))\n "},{"location":"reference/sort_utils/#tablite.sort_utils.HashDict.__repr__","title":"tablite.sort_utils.HashDict.__repr__() -> str ","text":"Source code in tablite/sort_utils.py def __repr__(self) -> str:\n return '{' + \", \".join([f\"{k}: {v}\" for k, v in self.items()]) + '}'\n "},{"location":"reference/sort_utils/#tablite.sort_utils.HashDict.__str__","title":"tablite.sort_utils.HashDict.__str__() -> str ","text":"Source code in tablite/sort_utils.py def __str__(self) -> str:\n return repr(self)\n "},{"location":"reference/sort_utils/#tablite.sort_utils-functions","title":"Functions","text":""},{"location":"reference/sort_utils/#tablite.sort_utils.text_sort","title":"tablite.sort_utils.text_sort(values, reverse=False) ","text":"Sorts everything as text. 
Source code in tablite/sort_utils.py def text_sort(values, reverse=False):\n \"\"\"\n Sorts everything as text.\n \"\"\"\n text = {str(i): i for i in values}\n L = list(text.keys())\n L.sort(key=uca_collator.sort_key, reverse=reverse)\n d = {text[value]: ix for ix, value in enumerate(L)}\n return d\n "},{"location":"reference/sort_utils/#tablite.sort_utils.unix_sort","title":"tablite.sort_utils.unix_sort(values, reverse=False) ","text":"Unix sortation sorts by the following order: | rank | type | value | +------+-----------+--------------------------------------------+ | 0 | None | floating point -infinite | | 1 | bool | 0 as False, 1 as True | | 2 | int | as numeric value | | 2 | float | as numeric value | | 3 | time | \u03c4 * seconds into the day / (24 * 60 * 60) | | 4 | date | as integer days since 1970/1/1 | | 5 | datetime | as float using date (int) + time (decimal) | | 6 | timedelta | as float using date (int) + time (decimal) | | 7 | str | using unicode | +------+-----------+--------------------------------------------+ \u03c4 = 2 * \u03c0 Source code in tablite/sort_utils.py def unix_sort(values, reverse=False):\n \"\"\"\n Unix sortation sorts by the following order:\n\n | rank | type | value |\n +------+-----------+--------------------------------------------+\n | 0 | None | floating point -infinite |\n | 1 | bool | 0 as False, 1 as True |\n | 2 | int | as numeric value |\n | 2 | float | as numeric value |\n | 3 | time | \u03c4 * seconds into the day / (24 * 60 * 60) |\n | 4 | date | as integer days since 1970/1/1 |\n | 5 | datetime | as float using date (int) + time (decimal) |\n | 6 | timedelta | as float using date (int) + time (decimal) |\n | 7 | str | using unicode |\n +------+-----------+--------------------------------------------+\n\n \u03c4 = 2 * \u03c0\n\n \"\"\"\n text, non_text = [], []\n\n # L = []\n # text = [i for i in values if isinstance(i, str)]\n # text.sort(key=uca_collator.sort_key, reverse=reverse)\n # text_code = _unix_typecodes[str]\n # L = [(text_code, ix, v) for ix, v in enumerate(text)]\n\n for value in values:\n if isinstance(value, str):\n text.append(value)\n else:\n t = type(value)\n TC = _unix_typecodes[t]\n tf = _unix_value_function[t]\n VC = tf(value)\n non_text.append((TC, VC, value))\n non_text.sort(reverse=reverse)\n\n text.sort(key=uca_collator.sort_key, reverse=reverse)\n text_code = _unix_typecodes[str]\n text = [(text_code, ix, v) for ix, v in enumerate(text)]\n\n d = HashDict()\n L = non_text + text\n for ix, (_, _, value) in enumerate(L):\n d[value] = ix\n return d\n "},{"location":"reference/sort_utils/#tablite.sort_utils.excel_sort","title":"tablite.sort_utils.excel_sort(values, reverse=False) ","text":"Excel sortation sorts by the following order: | rank | type | value | +------+-----------+--------------------------------------------+ | 1 | int | as numeric value | | 1 | float | as numeric value | | 1 | time | as seconds into the day / (24 * 60 * 60) | | 1 | date | as integer days since 1900/1/1 | | 1 | datetime | as float using date (int) + time (decimal) | | (1)*| timedelta | as float using date (int) + time (decimal) | | 2 | str | using unicode | | 3 | bool | 0 as False, 1 as True | | 4 | None | floating point infinite. | +------+-----------+--------------------------------------------+ - Excel doesn't have timedelta.
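The same rank-dictionary contract applies here. A sketch via the rank helper, assuming the type codes shown in the table above:

from tablite.sort_utils import rank

r = rank([None, "x", 3, True], reverse=False, mode="excel")
# numeric before text before bool before None:
# r == {3: 0, "x": 1, True: 2, None: 3}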
Source code in tablite/sort_utils.py def excel_sort(values, reverse=False):\n \"\"\"\n Excel sortation sorts by the following order:\n\n | rank | type | value |\n +------+-----------+--------------------------------------------+\n | 1 | int | as numeric value |\n | 1 | float | as numeric value |\n | 1 | time | as seconds into the day / (24 * 60 * 60) |\n | 1 | date | as integer days since 1900/1/1 |\n | 1 | datetime | as float using date (int) + time (decimal) |\n | (1)*| timedelta | as float using date (int) + time (decimal) |\n | 2 | str | using unicode |\n | 3 | bool | 0 as False, 1 as True |\n | 4 | None | floating point infinite. |\n +------+-----------+--------------------------------------------+\n\n * Excel doesn't have timedelta.\n \"\"\"\n\n def tup(TC, value):\n return (TC, _excel_value_function[t](value), value)\n\n text, numeric, booles, nones = [], [], [], []\n for value in values:\n t = type(value)\n TC = _excel_typecodes[t]\n\n if TC == 0:\n numeric.append(tup(TC, value))\n elif TC == 1:\n text.append(value) # text is processed later.\n elif TC == 2:\n booles.append(tup(TC, value))\n elif TC == 3:\n booles.append(tup(TC, value))\n else:\n raise TypeError(f\"no typecode for {value}\")\n\n if text:\n text.sort(key=uca_collator.sort_key, reverse=reverse)\n text = [(2, ix, v) for ix, v in enumerate(text)]\n\n numeric.sort(reverse=reverse)\n booles.sort(reverse=reverse)\n nones.sort(reverse=reverse)\n\n if reverse:\n L = nones + booles + text + numeric\n else:\n L = numeric + text + booles + nones\n d = {value: ix for ix, (_, _, value) in enumerate(L)}\n return d\n "},{"location":"reference/sort_utils/#tablite.sort_utils.rank","title":"tablite.sort_utils.rank(values, reverse, mode) ","text":"values: list of values to sort. reverse: bool mode: as 'text', as 'numeric' or as 'excel' return: dict: d[value] = rank Source code in tablite/sort_utils.py def rank(values, reverse, mode):\n \"\"\"\n values: list of values to sort.\n reverse: bool\n mode: as 'text', as 'numeric' or as 'excel'\n return: dict: d[value] = rank\n \"\"\"\n if mode not in modes:\n raise ValueError(f\"{mode} not in list of modes: {list(modes)}\")\n f = modes.get(mode)\n return f(values, reverse)\n "},{"location":"reference/sortation/","title":"Sortation","text":""},{"location":"reference/sortation/#tablite.sortation","title":"tablite.sortation ","text":""},{"location":"reference/sortation/#tablite.sortation-attributes","title":"Attributes","text":""},{"location":"reference/sortation/#tablite.sortation-classes","title":"Classes","text":""},{"location":"reference/sortation/#tablite.sortation-functions","title":"Functions","text":""},{"location":"reference/sortation/#tablite.sortation.sort_index","title":"tablite.sortation.sort_index(T, mapping, sort_mode='excel', tqdm=_tqdm, pbar=None) ","text":"helper for methods sort and is_sorted param: sort_mode: str: \"alphanumeric\", \"unix\", or, \"excel\" (default) param: **kwargs: sort criteria. See Table.sort() Source code in tablite/sortation.py def sort_index(T, mapping, sort_mode=\"excel\", tqdm=_tqdm, pbar=None):\n \"\"\"\n helper for methods `sort` and `is_sorted`\n\n param: sort_mode: str: \"alphanumeric\", \"unix\", or, \"excel\" (default)\n param: **kwargs: sort criteria. 
See Table.sort()\n \"\"\"\n\n sub_cls_check(T, BaseTable)\n\n if not isinstance(mapping, dict) or not mapping:\n raise TypeError(\"Expected mapping (dict)?\")\n\n for k, v in mapping.items():\n if k not in T.columns:\n raise ValueError(f\"no column {k}\")\n if not isinstance(v, bool):\n raise ValueError(f\"{k} was mapped to {v} - a non-boolean\")\n\n if sort_mode not in sort_modes:\n raise ValueError(f\"{sort_mode} not in list of sort_modes: {list(sort_modes)}\")\n\n rank = {i: tuple() for i in range(len(T))} # create index and empty tuple for sortation.\n\n _pbar = tqdm(total=len(mapping.items()), desc=\"creating sort index\") if pbar is None else pbar\n\n for key, reverse in mapping.items():\n col = T[key][:]\n ranks = sort_rank(values=[numpy_to_python(v) for v in multitype_set(col)], reverse=reverse, mode=sort_mode)\n assert isinstance(ranks, dict)\n for ix, v in enumerate(col):\n v2 = numpy_to_python(v)\n rank[ix] += (ranks[v2],) # add tuple for each sortation level.\n\n _pbar.update(1)\n\n del col\n del ranks\n\n new_order = [(r, i) for i, r in rank.items()] # tuples are listed and sort...\n del rank # free memory.\n\n new_order.sort()\n sorted_index = [i for _, i in new_order] # new index is extracted.\n new_order.clear()\n return np.array(sorted_index, dtype=np.int64)\n "},{"location":"reference/sortation/#tablite.sortation.reindex","title":"tablite.sortation.reindex(T, index) ","text":"index: list of integers that declare sort order. Examples: Table: ['a','b','c','d','e','f','g','h']\nindex: [0,2,4,6]\nresult: ['b','d','f','h']\n\nTable: ['a','b','c','d','e','f','g','h']\nindex: [0,2,4,6,1,3,5,7]\nresult: ['a','c','e','g','b','d','f','h']\n Source code in tablite/sortation.py def reindex(T, index):\n \"\"\"\n index: list of integers that declare sort order.\n\n Examples:\n\n Table: ['a','b','c','d','e','f','g','h']\n index: [0,2,4,6]\n result: ['b','d','f','h']\n\n Table: ['a','b','c','d','e','f','g','h']\n index: [0,2,4,6,1,3,5,7]\n result: ['a','c','e','g','b','d','f','h']\n\n \"\"\"\n sub_cls_check(T, BaseTable)\n if isinstance(index, list):\n index = np.array(index, dtype=int)\n type_check(index, np.ndarray)\n if max(index) >= len(T):\n raise IndexError(\"index out of range: max(index) > len(self)\")\n if min(index) < -len(T):\n raise IndexError(\"index out of range: min(index) < -len(self)\")\n\n fields = len(T) * len(T.columns)\n m = select_processing_method(fields, _reindex, _mp_reindex)\n return m(T, index)\n "},{"location":"reference/sortation/#tablite.sortation.sort","title":"tablite.sortation.sort(T, mapping, sort_mode='excel', tqdm=_tqdm, pbar: _tqdm = None) ","text":"Perform multi-pass sorting with precedence given order of column names. sort_mode: str: \"alphanumeric\", \"unix\", or, \"excel\" kwargs: keys: columns, values: 'reverse' as boolean. examples: Table.sort('A'=False) means sort by 'A' in ascending order. Table.sort('A'=True, 'B'=False) means sort 'A' in descending order, then (2nd priority) sort B in ascending order. 
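The docstring examples above are written in an informal keyword style; the signature itself takes the sort criteria as a mapping dict. A sketch:

from tablite import Table
from tablite.sortation import sort

t = Table({"A": [2, 1, 2], "B": [9, 8, 7]})
t2 = sort(t, {"A": True, "B": False})   # 'A' descending, then 'B' ascending
# expected row order: (2, 7), (2, 9), (1, 8)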
Source code in tablite/sortation.py def sort(T, mapping, sort_mode=\"excel\", tqdm=_tqdm, pbar: _tqdm = None):\n \"\"\"Perform multi-pass sorting with precedence given order of column names.\n sort_mode: str: \"alphanumeric\", \"unix\", or, \"excel\"\n kwargs:\n keys: columns,\n values: 'reverse' as boolean.\n\n examples:\n Table.sort('A'=False) means sort by 'A' in ascending order.\n Table.sort('A'=True, 'B'=False) means sort 'A' in descending order, then (2nd priority)\n sort B in ascending order.\n \"\"\"\n sub_cls_check(T, BaseTable)\n\n index = sort_index(T, mapping, sort_mode=sort_mode, tqdm=_tqdm, pbar=pbar)\n m = select_processing_method(len(T) * len(T.columns), _sp_reindex, _mp_reindex)\n return m(T, index, tqdm=tqdm, pbar=pbar)\n "},{"location":"reference/sortation/#tablite.sortation.is_sorted","title":"tablite.sortation.is_sorted(T, mapping, sort_mode='excel') ","text":"Performs multi-pass sorting check with precedence given order of column names. PARAMETER DESCRIPTION mapping sort criteria. See Table.sort() RETURNS DESCRIPTION bool Source code in tablite/sortation.py def is_sorted(T, mapping, sort_mode=\"excel\"):\n \"\"\"Performs multi-pass sorting check with precedence given order of column names.\n\n Args:\n mapping: sort criteria. See Table.sort()\n sort_mode = sort mode. See Table.sort()\n\n Returns:\n bool\n \"\"\"\n index = sort_index(T, mapping, sort_mode=sort_mode)\n match = np.arange(len(T))\n return np.all(index == match)\n "},{"location":"reference/tools/","title":"Tools","text":""},{"location":"reference/tools/#tablite.tools","title":"tablite.tools ","text":""},{"location":"reference/tools/#tablite.tools-attributes","title":"Attributes","text":""},{"location":"reference/tools/#tablite.tools.guess","title":"tablite.tools.guess = DataTypes.guess module-attribute ","text":""},{"location":"reference/tools/#tablite.tools.xround","title":"tablite.tools.xround = DataTypes.round module-attribute ","text":""},{"location":"reference/tools/#tablite.tools-classes","title":"Classes","text":""},{"location":"reference/tools/#tablite.tools-functions","title":"Functions","text":""},{"location":"reference/tools/#tablite.tools.head","title":"tablite.tools.head(path, linecount=5, delimiter=None) ","text":"Gets the head of any supported file format. 
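A one-line sketch (the file path is hypothetical):

from tablite.tools import head

head("sales.csv", linecount=3)   # the leading rows, e.g. as lists of cell strings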
Source code in tablite/tools.py def head(path, linecount=5, delimiter=None):\n \"\"\"\n Gets the head of any supported file format.\n \"\"\"\n return get_headers(path, linecount=linecount, delimiter=delimiter)\n "},{"location":"reference/utils/","title":"Utils","text":""},{"location":"reference/utils/#tablite.utils","title":"tablite.utils ","text":""},{"location":"reference/utils/#tablite.utils-attributes","title":"Attributes","text":""},{"location":"reference/utils/#tablite.utils.letters","title":"tablite.utils.letters = string.ascii_lowercase + string.digits module-attribute ","text":""},{"location":"reference/utils/#tablite.utils.NoneType","title":"tablite.utils.NoneType = type(None) module-attribute ","text":""},{"location":"reference/utils/#tablite.utils.required_keys","title":"tablite.utils.required_keys = {'min', 'max', 'mean', 'median', 'stdev', 'mode', 'distinct', 'iqr_low', 'iqr_high', 'iqr', 'sum', 'summary type', 'histogram'} module-attribute ","text":""},{"location":"reference/utils/#tablite.utils.summary_methods","title":"tablite.utils.summary_methods = {bool: _boolean_statistics_summary, int: _numeric_statistics_summary, float: _numeric_statistics_summary, str: _string_statistics_summary, date: _date_statistics_summary, datetime: _datetime_statistics_summary, time: _time_statistics_summary, timedelta: _timedelta_statistics_summary, type(None): _none_type_summary} module-attribute ","text":""},{"location":"reference/utils/#tablite.utils-classes","title":"Classes","text":""},{"location":"reference/utils/#tablite.utils-functions","title":"Functions","text":""},{"location":"reference/utils/#tablite.utils.generate_random_string","title":"tablite.utils.generate_random_string(len) ","text":"Source code in tablite/utils.py def generate_random_string(len):\n return \"\".join(random.choice(letters) for i in range(len))\n "},{"location":"reference/utils/#tablite.utils.type_check","title":"tablite.utils.type_check(var, kind) ","text":"Source code in tablite/utils.py def type_check(var, kind):\n if not isinstance(var, kind):\n raise TypeError(f\"Expected {kind}, not {type(var)}\")\n "},{"location":"reference/utils/#tablite.utils.sub_cls_check","title":"tablite.utils.sub_cls_check(c, kind) ","text":"Source code in tablite/utils.py def sub_cls_check(c, kind):\n if not issubclass(type(c), kind):\n raise TypeError(f\"Expected {kind}, not {type(c)}\")\n "},{"location":"reference/utils/#tablite.utils.name_check","title":"tablite.utils.name_check(options, *names) ","text":"Source code in tablite/utils.py def name_check(options, *names):\n for n in names:\n if n not in options:\n raise ValueError(f\"{n} not in {options}\")\n "},{"location":"reference/utils/#tablite.utils.unique_name","title":"tablite.utils.unique_name(wanted_name, set_of_names) ","text":"returns a wanted_name as wanted_name_i given a list of names which guarantees unique naming. 
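For instance:

from tablite.utils import unique_name

unique_name("value", {"value", "value_1"})   # -> 'value_2'
unique_name("fresh", {"value"})              # -> 'fresh'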
Source code in tablite/utils.py def unique_name(wanted_name, set_of_names):\n \"\"\"\n returns a wanted_name as wanted_name_i given a list of names\n which guarantees unique naming.\n \"\"\"\n if not isinstance(set_of_names, set):\n set_of_names = set(set_of_names)\n name, i = wanted_name, 1\n while name in set_of_names:\n name = f\"{wanted_name}_{i}\"\n i += 1\n return name\n "},{"location":"reference/utils/#tablite.utils.expression_interpreter","title":"tablite.utils.expression_interpreter(expression, columns) ","text":"Interprets valid expressions such as: \"all((A==B, C!=4, 200<D))\"\n as def _f(A,B,C,D): return all((A==B, C!=4, 200<D)) using python's compiler. Source code in tablite/utils.py def expression_interpreter(expression, columns):\n \"\"\"\n Interprets valid expressions such as:\n\n \"all((A==B, C!=4, 200<D))\"\n\n as:\n def _f(A,B,C,D):\n return all((A==B, C!=4, 200<D))\n\n using python's compiler.\n \"\"\"\n if not isinstance(expression, str):\n raise TypeError(f\"`{expression}` is not a str\")\n if not isinstance(columns, list):\n raise TypeError\n if not all(isinstance(i, str) for i in columns):\n raise TypeError\n\n req_columns = \", \".join(i for i in columns if i in expression)\n script = f\"def f({req_columns}):\\n return {expression}\"\n tree = ast.parse(script)\n code = compile(tree, filename=\"blah\", mode=\"exec\")\n namespace = {}\n exec(code, namespace)\n f = namespace[\"f\"]\n if not callable(f):\n raise ValueError(f\"The expression could not be parse: {expression}\")\n return f\n "},{"location":"reference/utils/#tablite.utils.intercept","title":"tablite.utils.intercept(A, B) ","text":"Enables calculation of the intercept of two range objects. Used to determine if a datablock contains a slice. PARAMETER DESCRIPTION A range B range RETURNS DESCRIPTION range The intercept of ranges A and B. 
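Two illustrative checks of the overlap logic:

from tablite.utils import intercept

intercept(range(0, 10, 2), range(4, 20, 3))   # -> range(4, 10, 6), i.e. [4]
intercept(range(0, 5), range(3, 8))           # -> range(3, 5), i.e. [3, 4]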
Source code in tablite/utils.py def intercept(A, B):\n \"\"\"Enables calculation of the intercept of two range objects.\n Used to determine if a datablock contains a slice.\n\n Args:\n A: range\n B: range\n\n Returns:\n range: The intercept of ranges A and B.\n \"\"\"\n type_check(A, range)\n type_check(B, range)\n\n if A.step < 1:\n A = range(A.stop + 1, A.start + 1, 1)\n if B.step < 1:\n B = range(B.stop + 1, B.start + 1, 1)\n\n if len(A) == 0:\n return range(0)\n if len(B) == 0:\n return range(0)\n\n if A.stop <= B.start:\n return range(0)\n if A.start >= B.stop:\n return range(0)\n\n if A.start <= B.start:\n if A.stop <= B.stop:\n start, end = B.start, A.stop\n elif A.stop > B.stop:\n start, end = B.start, B.stop\n else:\n raise ValueError(\"bad logic\")\n elif A.start < B.stop:\n if A.stop <= B.stop:\n start, end = A.start, A.stop\n elif A.stop > B.stop:\n start, end = A.start, B.stop\n else:\n raise ValueError(\"bad logic\")\n else:\n raise ValueError(\"bad logic\")\n\n a_steps = math.ceil((start - A.start) / A.step)\n a_start = (a_steps * A.step) + A.start\n\n b_steps = math.ceil((start - B.start) / B.step)\n b_start = (b_steps * B.step) + B.start\n\n if A.step == 1 or B.step == 1:\n start = max(a_start, b_start)\n step = max(A.step, B.step)\n return range(start, end, step)\n elif A.step == B.step:\n a, b = min(A.start, B.start), max(A.start, B.start)\n if (b - a) % A.step != 0: # then the ranges are offset.\n return range(0)\n else:\n return range(b, end, step)\n else:\n # determine common step size:\n step = max(A.step, B.step) if math.gcd(A.step, B.step) != 1 else A.step * B.step\n # examples:\n # 119 <-- 17 if 1 != 1 else 119 <-- max(7, 17) if math.gcd(7, 17) != 1 else 7 * 17\n # 30 <-- 30 if 3 != 1 else 90 <-- max(3, 30) if math.gcd(3, 30) != 1 else 3*30\n if A.step < B.step:\n for n in range(a_start, end, A.step): # increment in smallest step to identify the first common value.\n if n < b_start:\n continue\n elif (n - b_start) % B.step == 0:\n return range(n, end, step) # common value found.\n else:\n for n in range(b_start, end, B.step):\n if n < a_start:\n continue\n elif (n - a_start) % A.step == 0:\n return range(n, end, step)\n\n return range(0)\n "},{"location":"reference/utils/#tablite.utils.summary_statistics","title":"tablite.utils.summary_statistics(values, counts) ","text":"values: any type counts: integer returns dict with: - min (int/float, length of str, date) - max (int/float, length of str, date) - mean (int/float, length of str, date) - median (int/float, length of str, date) - stdev (int/float, length of str, date) - mode (int/float, length of str, date) - distinct (number of distinct values) - iqr (int/float, length of str, date) - sum (int/float, length of str, date) - histogram (2 arrays: values, count of each values) Source code in tablite/utils.py def summary_statistics(values, counts):\n \"\"\"\n values: any type\n counts: integer\n\n returns dict with:\n - min (int/float, length of str, date)\n - max (int/float, length of str, date)\n - mean (int/float, length of str, date)\n - median (int/float, length of str, date)\n - stdev (int/float, length of str, date)\n - mode (int/float, length of str, date)\n - distinct (number of distinct values)\n - iqr (int/float, length of str, date)\n - sum (int/float, length of str, date)\n - histogram (2 arrays: values, count of each values)\n \"\"\"\n # determine the dominant datatype:\n dtypes = defaultdict(int)\n most_frequent, most_frequent_dtype = 0, int\n for v, c in zip(values, counts):\n dtype = type(v)\n total 
= dtypes[dtype] + c\n dtypes[dtype] = total\n if total > most_frequent:\n most_frequent_dtype = dtype\n most_frequent = total\n\n if most_frequent == 0:\n return {}\n\n most_frequent_dtype = max(dtypes, key=dtypes.get)\n mask = [type(v) == most_frequent_dtype for v in values]\n v = list(compress(values, mask))\n c = list(compress(counts, mask))\n\n f = summary_methods.get(most_frequent_dtype, int)\n result = f(v, c)\n result[\"distinct\"] = len(values)\n result[\"summary type\"] = most_frequent_dtype.__name__\n result[\"histogram\"] = [values, counts]\n assert set(result.keys()) == required_keys, \"Key missing!\"\n return result\n "},{"location":"reference/utils/#tablite.utils.date_range","title":"tablite.utils.date_range(start, stop, step) ","text":"Source code in tablite/utils.py def date_range(start, stop, step):\n if not isinstance(start, datetime):\n raise TypeError(\"start is not datetime\")\n if not isinstance(stop, datetime):\n raise TypeError(\"stop is not datetime\")\n if not isinstance(step, timedelta):\n raise TypeError(\"step is not timedelta\")\n n = (stop - start) // step\n return [start + step * i for i in range(n)]\n "},{"location":"reference/utils/#tablite.utils.dict_to_rows","title":"tablite.utils.dict_to_rows(d) ","text":"Source code in tablite/utils.py def dict_to_rows(d):\n type_check(d, dict)\n rows = []\n max_length = max(len(i) for i in d.values())\n order = list(d.keys())\n rows.append(order)\n for i in range(max_length):\n row = [d[k][i] for k in order]\n rows.append(row)\n return rows\n "},{"location":"reference/utils/#tablite.utils.calc_col_count","title":"tablite.utils.calc_col_count(letters: str) ","text":"Source code in tablite/utils.py def calc_col_count(letters: str):\n ord_nil = ord(\"A\") - 1\n cols_per_letter = ord(\"Z\") - ord_nil\n col_count = 0\n\n for i, v in enumerate(reversed(letters)):\n col_count = col_count + (ord(v) - ord_nil) * pow(cols_per_letter, i)\n\n return col_count\n "},{"location":"reference/utils/#tablite.utils.calc_true_dims","title":"tablite.utils.calc_true_dims(sheet) ","text":"Source code in tablite/utils.py def calc_true_dims(sheet):\n src = sheet._get_source()\n max_col, max_row = 0, 0\n\n regex = re.compile(\"\\d+\")\n\n def handleStartElement(name, attrs):\n nonlocal max_col, max_row\n\n if name == \"c\":\n last_index = attrs[\"r\"]\n idx, _ = next(regex.finditer(last_index)).span()\n letters, digits = last_index[0:idx], int(last_index[idx:])\n\n col_idx, row_idx = calc_col_count(letters), digits\n\n max_col, max_row = max(max_col, col_idx), max(max_row, row_idx)\n\n parser = expat.ParserCreate()\n parser.buffer_text = True\n parser.StartElementHandler = handleStartElement\n parser.ParseFile(src)\n\n return max_col, max_row\n "},{"location":"reference/utils/#tablite.utils.fixup_worksheet","title":"tablite.utils.fixup_worksheet(worksheet) ","text":"Source code in tablite/utils.py def fixup_worksheet(worksheet):\n try:\n ws_cols, ws_rows = calc_true_dims(worksheet)\n\n worksheet._max_column = ws_cols\n worksheet._max_row = ws_rows\n except Exception as e:\n logging.error(f\"Failed to fetch true dimensions: {e}\")\n "},{"location":"reference/utils/#tablite.utils.update_access_time","title":"tablite.utils.update_access_time(path) ","text":"Source code in tablite/utils.py def update_access_time(path):\n path = Path(path)\n stat = path.stat()\n os.utime(path, (now(), stat.st_mtime))\n "},{"location":"reference/utils/#tablite.utils.load_numpy","title":"tablite.utils.load_numpy(path) ","text":"Source code in tablite/utils.py def 
load_numpy(path):\n update_access_time(path)\n\n return np.load(path, allow_pickle=True, fix_imports=False)\n "},{"location":"reference/utils/#tablite.utils.select_type_name","title":"tablite.utils.select_type_name(dtypes: dict) ","text":"Source code in tablite/utils.py def select_type_name(dtypes: dict):\n dtypes = [t for t in dtypes.items() if t[0] != NoneType]\n\n if len(dtypes) == 0:\n return \"empty\"\n\n (best_type, _), *_ = sorted(dtypes, key=lambda t: t[1], reverse=True)\n\n return best_type.__name__\n "},{"location":"reference/utils/#tablite.utils.get_predominant_types","title":"tablite.utils.get_predominant_types(table, all_dtypes=None) ","text":"Source code in tablite/utils.py def get_predominant_types(table, all_dtypes=None):\n if all_dtypes is None:\n all_dtypes = table.types()\n\n dtypes = {\n k: select_type_name(v)\n for k, v in all_dtypes.items()\n }\n\n return dtypes\n "},{"location":"reference/utils/#tablite.utils.py_to_nim_encoding","title":"tablite.utils.py_to_nim_encoding(encoding: str) -> str ","text":"Source code in tablite/utils.py def py_to_nim_encoding(encoding: str) -> str:\n if encoding is None or encoding.lower() in [\"ascii\", \"utf8\", \"utf-8\", \"utf-8-sig\"]:\n return \"ENC_UTF8\"\n elif encoding.lower() in [\"utf16\", \"utf-16\"]:\n return \"ENC_UTF16\"\n elif encoding in Config.NIM_SUPPORTED_CONV_TYPES:\n return f\"ENC_CONV|{encoding}\"\n\n raise NotImplementedError(f\"encoding not implemented: {encoding}\")\n "},{"location":"reference/version/","title":"Version","text":""},{"location":"reference/version/#tablite.version","title":"tablite.version ","text":""},{"location":"reference/version/#tablite.version-attributes","title":"Attributes","text":""},{"location":"reference/version/#tablite.version.__version_info__","title":"tablite.version.__version_info__ = (major, minor, patch) module-attribute ","text":""},{"location":"reference/version/#tablite.version.__version__","title":"tablite.version.__version__ = '.'.join(str(i) for i in __version_info__) module-attribute ","text":""}]}
\ No newline at end of file
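Note: the search-index blob above embeds the documented tablite.utils helpers verbatim. A small usage sketch of three of them follows, with the expected results worked out from the quoted source; it assumes the helpers import from tablite.utils as documented.

from datetime import datetime, timedelta
from tablite.utils import intercept, date_range, dict_to_rows

# intercept: common values of two ranges, returned as a range.
# A = {0,2,4,6,8}, B = {4,7,10,13,16,19} -> only 4 is shared.
assert list(intercept(range(0, 10, 2), range(4, 20, 3))) == [4]

# date_range: n = (stop - start) // step values, starting at `start`.
assert date_range(datetime(2024, 1, 1), datetime(2024, 1, 4), timedelta(days=1)) == [
    datetime(2024, 1, 1), datetime(2024, 1, 2), datetime(2024, 1, 3),
]

# dict_to_rows: header row first, then one row per index.
assert dict_to_rows({"a": [1, 2], "b": [3, 4]}) == [["a", "b"], [1, 3], [2, 4]]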
diff --git a/master/sitemap.xml b/master/sitemap.xml
index 46423ca0..9b4aa0d0 100644
--- a/master/sitemap.xml
+++ b/master/sitemap.xml
@@ -2,147 +2,147 @@
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
    <url>
        <loc>https://root-11.github.io/tablite/master/</loc>
-       <lastmod>2024-04-10</lastmod>
+       <lastmod>2024-04-12</lastmod>
        <changefreq>daily</changefreq>
    </url>
    <url>
        <loc>https://root-11.github.io/tablite/master/benchmarks/</loc>
-       <lastmod>2024-04-10</lastmod>
+       <lastmod>2024-04-12</lastmod>
        <changefreq>daily</changefreq>
    </url>
    <url>
        <loc>https://root-11.github.io/tablite/master/changelog/</loc>
-       <lastmod>2024-04-10</lastmod>
+       <lastmod>2024-04-12</lastmod>
        <changefreq>daily</changefreq>
    </url>
    <url>
        <loc>https://root-11.github.io/tablite/master/tutorial/</loc>
-       <lastmod>2024-04-10</lastmod>
+       <lastmod>2024-04-12</lastmod>
        <changefreq>daily</changefreq>
    </url>
    <url>
        <loc>https://root-11.github.io/tablite/master/reference/base/</loc>
-       <lastmod>2024-04-10</lastmod>
+       <lastmod>2024-04-12</lastmod>
        <changefreq>daily</changefreq>
    </url>
    <url>
        <loc>https://root-11.github.io/tablite/master/reference/config/</loc>
-       <lastmod>2024-04-10</lastmod>
+       <lastmod>2024-04-12</lastmod>
        <changefreq>daily</changefreq>
    </url>
    <url>
        <loc>https://root-11.github.io/tablite/master/reference/core/</loc>
-       <lastmod>2024-04-10</lastmod>
+       <lastmod>2024-04-12</lastmod>
        <changefreq>daily</changefreq>
    </url>
    <url>
        <loc>https://root-11.github.io/tablite/master/reference/datasets/</loc>
-       <lastmod>2024-04-10</lastmod>
+       <lastmod>2024-04-12</lastmod>
        <changefreq>daily</changefreq>
    </url>
    <url>
        <loc>https://root-11.github.io/tablite/master/reference/datatypes/</loc>
-       <lastmod>2024-04-10</lastmod>
+       <lastmod>2024-04-12</lastmod>
        <changefreq>daily</changefreq>
    </url>
    <url>
        <loc>https://root-11.github.io/tablite/master/reference/diff/</loc>
-       <lastmod>2024-04-10</lastmod>
+       <lastmod>2024-04-12</lastmod>
        <changefreq>daily</changefreq>
    </url>
    <url>
        <loc>https://root-11.github.io/tablite/master/reference/export_utils/</loc>
-       <lastmod>2024-04-10</lastmod>
+       <lastmod>2024-04-12</lastmod>
        <changefreq>daily</changefreq>
    </url>
    <url>
        <loc>https://root-11.github.io/tablite/master/reference/file_reader_utils/</loc>
-       <lastmod>2024-04-10</lastmod>
+       <lastmod>2024-04-12</lastmod>
        <changefreq>daily</changefreq>
    </url>
    <url>
        <loc>https://root-11.github.io/tablite/master/reference/groupby_utils/</loc>
-       <lastmod>2024-04-10</lastmod>
+       <lastmod>2024-04-12</lastmod>
        <changefreq>daily</changefreq>
    </url>
    <url>
        <loc>https://root-11.github.io/tablite/master/reference/import_utils/</loc>
-       <lastmod>2024-04-10</lastmod>
+       <lastmod>2024-04-12</lastmod>
        <changefreq>daily</changefreq>
    </url>
    <url>
        <loc>https://root-11.github.io/tablite/master/reference/imputation/</loc>
-       <lastmod>2024-04-10</lastmod>
+       <lastmod>2024-04-12</lastmod>
        <changefreq>daily</changefreq>
    </url>
    <url>
        <loc>https://root-11.github.io/tablite/master/reference/joins/</loc>
-       <lastmod>2024-04-10</lastmod>
+       <lastmod>2024-04-12</lastmod>
        <changefreq>daily</changefreq>
    </url>
    <url>
        <loc>https://root-11.github.io/tablite/master/reference/lookup/</loc>
-       <lastmod>2024-04-10</lastmod>
+       <lastmod>2024-04-12</lastmod>
        <changefreq>daily</changefreq>
    </url>
    <url>
        <loc>https://root-11.github.io/tablite/master/reference/match/</loc>
-       <lastmod>2024-04-10</lastmod>
+       <lastmod>2024-04-12</lastmod>
        <changefreq>daily</changefreq>
    </url>
    <url>
        <loc>https://root-11.github.io/tablite/master/reference/merge/</loc>
-       <lastmod>2024-04-10</lastmod>
+       <lastmod>2024-04-12</lastmod>
        <changefreq>daily</changefreq>
    </url>
    <url>
        <loc>https://root-11.github.io/tablite/master/reference/mp_utils/</loc>
-       <lastmod>2024-04-10</lastmod>
+       <lastmod>2024-04-12</lastmod>
        <changefreq>daily</changefreq>
    </url>
    <url>
        <loc>https://root-11.github.io/tablite/master/reference/nimlite/</loc>
-       <lastmod>2024-04-10</lastmod>
+       <lastmod>2024-04-12</lastmod>
        <changefreq>daily</changefreq>
    </url>
    <url>
        <loc>https://root-11.github.io/tablite/master/reference/pivots/</loc>
-       <lastmod>2024-04-10</lastmod>
+       <lastmod>2024-04-12</lastmod>
        <changefreq>daily</changefreq>
    </url>
    <url>
        <loc>https://root-11.github.io/tablite/master/reference/redux/</loc>
-       <lastmod>2024-04-10</lastmod>
+       <lastmod>2024-04-12</lastmod>
        <changefreq>daily</changefreq>
    </url>
    <url>
        <loc>https://root-11.github.io/tablite/master/reference/reindex/</loc>
-       <lastmod>2024-04-10</lastmod>
+       <lastmod>2024-04-12</lastmod>
        <changefreq>daily</changefreq>
    </url>
    <url>
        <loc>https://root-11.github.io/tablite/master/reference/sort_utils/</loc>
-       <lastmod>2024-04-10</lastmod>
+       <lastmod>2024-04-12</lastmod>
        <changefreq>daily</changefreq>
    </url>
    <url>
        <loc>https://root-11.github.io/tablite/master/reference/sortation/</loc>
-       <lastmod>2024-04-10</lastmod>
+       <lastmod>2024-04-12</lastmod>
        <changefreq>daily</changefreq>
    </url>
    <url>
        <loc>https://root-11.github.io/tablite/master/reference/tools/</loc>
-       <lastmod>2024-04-10</lastmod>
+       <lastmod>2024-04-12</lastmod>
        <changefreq>daily</changefreq>
    </url>
    <url>
        <loc>https://root-11.github.io/tablite/master/reference/utils/</loc>
-       <lastmod>2024-04-10</lastmod>
+       <lastmod>2024-04-12</lastmod>
        <changefreq>daily</changefreq>
    </url>
    <url>
        <loc>https://root-11.github.io/tablite/master/reference/version/</loc>
-       <lastmod>2024-04-10</lastmod>
+       <lastmod>2024-04-12</lastmod>
        <changefreq>daily</changefreq>
    </url>
</urlset>
\ No newline at end of file
diff --git a/master/sitemap.xml.gz b/master/sitemap.xml.gz
index d14b302b..e6fc196c 100644
Binary files a/master/sitemap.xml.gz and b/master/sitemap.xml.gz differ
diff --git a/master/tablite/redux.py b/master/tablite/redux.py
index b922367e..826e4993 100644
--- a/master/tablite/redux.py
+++ b/master/tablite/redux.py
@@ -142,7 +142,7 @@ def _compress_one(T, mask):
    return new


-def _compress_both(T, mask, pbar: _tqdm):
+def compress_both(T, mask, pbar: _tqdm):
    # NOTE FOR DEVELOPERS:
    # np.compress is so fast that the overhead of multiprocessing doesn't pay off.
    cls = type(T)
@@ -161,30 +161,12 @@ def _compress_both(T, mask, pbar: _tqdm):
            data = T[name][start:end]
            true_col.extend(np.compress(mask[start:end], data))
            false_col.extend(np.compress(np.invert(mask)[start:end], data))
-            pbar.update(pbar_step)
+            if pbar is not None:
+                pbar.update(pbar_step)
    return true, false
-def _filter_using_list_of_dicts(T, expressions, filter_type, pbar: _tqdm):
-    """
-    enables filtering across columns for multiple criteria.
-
-    expressions:
-
-    str: Expression that can be compiled and executed row by row.
-        exampLe: "all((A==B and C!=4 and 200<D))"
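Taken together, the redux.py hunks promote compress_both to the public module surface and make the progress bar optional. A minimal sketch of the resulting call pattern follows; the table contents are illustrative, and it assumes pbar=None is now accepted throughout the function, not just at the update() call guarded above.

import numpy as np
from tablite import Table
from tablite.redux import compress_both

t = Table()
t["A"] = [1, 2, 3, 4]            # dict-of-lists style column assignment
t["B"] = [10, 20, 30, 40]

mask = np.array([True, False, True, False])  # one boolean per row
trues, falses = compress_both(t, mask, pbar=None)
# trues holds the rows where mask is True; falses holds the complement.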