adding series of modifications for early exploration

revdotcom · Apr 20, 2023 · c66625e · c66625e
1 parent 7ac25dc
commit c66625e
Show file tree

Hide file tree

Showing 13 changed files with 934 additions and 2 deletions.
diff --git a/RWKV-v4neo/50lines.nlp b/RWKV-v4neo/50lines.nlp
@@ -0,0 +1,50 @@
+token|speaker|ts|endTs|punctuation|prepunctuation|case|tags|oldTs|oldEndTs|ali_comment
+Uh|1|0.2100|0.3000|,||UC|[]|||
+today's|1|1.7400|2.1000|||LC|[]|||
+date|1|2.1000|2.2800|||LC|[]|||
+is|1|2.2800|2.4300|||LC|[]|||
+January|1|2.4300|2.8400|||UC|[]|||
+tenth|1|2.8400|3.1400|.||LC|['1:ORDINAL']|||
+It'|1|3.1400|3.2800|||UC|[]|||sub(<unk>)
+approximately|1|3.2800|3.7600|||LC|[]|||
+ten|1|3.7600|3.8500|||LC|['0:TIME']|||,push_last
+eleven|1|3.8500|4.1400|||LC|['0:TIME']|||,push_last
+in|1|4.1400|4.2100|||LC|[]|||
+the|1|4.2100|4.2700|||LC|[]|||
+morning|1|4.2700|4.7100|.||LC|[]|||
+we|1|5.0400|5.1500|||LC|['5:CONTRACTION']|||,push_last
+are|1|5.1500|5.2000|||LC|['5:CONTRACTION']|||,push_last
+here|1|5.2000|5.4500|||LC|[]|||
+with|1|5.4500|5.8100|||LC|[]|||
+Eve|1|6.1300|6.2800|||UC|[]|||
+Apodaca|1|6.2800|7.0100|.||UC|[]|||
+Uh|1|7.2900|7.5800|||UC|[]|||
+present|1|7.5800|7.9500|||LC|[]|||
+is|1|7.9500|8.1100|||LC|[]|||
+myself|1|8.1100|8.5600|,||LC|[]|||
+Quentin|1|8.9100|9.1700|||UC|[]|||
+Ray|1|9.2800|9.3400|,||UC|[]|||
+Andi|1|9.3400|9.5400|||UC|[]|||
+Reeb|1|9.5400|9.8600|,||UC|[]|||
+um|1|10.3100|10.6100|||LC|[]|||
+Dan|1|10.7800|11.0100|||UC|[]|||
+Aguilar|1|11.0100|11.5800|,||UC|[]|||
+Tye|1|11.8900|12.0700|||UC|[]|||
+Harmon|1|12.0700|12.4000|||UC|[]|||
+and|1|12.4000|12.7500|||LC|[]|||
+Norma|1|12.7500|12.9500|||UC|[]|||
+Delarosa|1|12.9500|13.5800|.||UC|[]|||
+And|1|17.1800|17.3300|||UC|[]|||
+we|1|17.3300|17.4000|||LC|['6:CONTRACTION']|||,push_last
+are|1|17.4000|17.4400|||LC|['6:CONTRACTION']|||,push_last
+here|1|17.4400|17.6200|||LC|[]|||
+for|1|17.6200|17.7800|||LC|[]|||
+a|1|||||LC|[]|||del
+pre-trial|1|||||LC|[]|||del
+interview|1|||.||LC|[]|||del
+So|1|||.||UC|[]|||del
+Hi|2|||||UC|[]|||del
+Eve|2|||.||UC|[]|||del
+Hi|3|||.||UC|[]|||del
+Thanks|2|||||UC|[]|||del
+for|2|||||LC|[]|||del
diff --git a/RWKV-v4neo/inspect-npy-array.py b/RWKV-v4neo/inspect-npy-array.py
@@ -0,0 +1,80 @@
+########################################################################################################
+# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM
+########################################################################################################
+
+import os, copy, types, gc, sys
+current_path = os.path.dirname(os.path.abspath(__file__))  # directory containing this script
+sys.path.append(f'{current_path}/../rwkv_pip_package/src')  # make ../rwkv_pip_package/src importable for later imports
+import sys  # NOTE(review): redundant, sys is already imported above
+from transformers import PreTrainedTokenizerFast
+tokenizer = PreTrainedTokenizerFast(tokenizer_file=f'./20B_tokenizer.json')  # NOTE(review): path is relative to the working directory, not to current_path
+# tokenizer = PreTrainedTokenizerFast(tokenizer_file=f'./20B_tokenizer.jp.json')
+import tqdm  # progress bar used by the counting loop below
+
+import numpy as np
+from diskarray import DiskArray
+
+# from diskarray import DiskArray
+#MODEL_NAME = '/data/workspaces/jp/LLMs/rwkv-4-raven/RWKV-4-Raven-3B-v9-Eng99%-Other1%-20230411-ctx4096.pth'
+#model = RWKV(model=MODEL_NAME, strategy=strategy)
+#pipeline = PIPELINE(model, f"{current_path}/20B_tokenizer.json")
+#dd = pipeline.encode(i)
+#xxx = pipeline.decode(model_tokens[out_last:])
+
+# to|2|17.3400|17.4000|||LC|[]|||
+
+npy_file = sys.argv[1]
+# going strait to list of int
+# array = DiskArray(npy_file, dtype=np.uint16).data.astype("int")
+eot_id = tokenizer.encode("<|endoftext|>")
+array = DiskArray(npy_file, dtype=np.uint16)
+
+beg_of_doc = 0
+end_of_doc = None
+i=0
+num_docs = 0
+# while end_of_doc is None and beg_of_doc+i < len(array):
+
+print("try 1")
+unique, counts = np.unique(array, return_counts=True)
+print(f"we have {len(unique)} unique tokens, end of text has a count of {counts[eot_id]}")
+
+with tqdm.tqdm("tokens processed", total=len(array)) as pbar:
+    # just count the number of end-of-text for now
+    while i < len(array):
+        if array[i] == eot_id:
+            num_docs += 1
+
+        if i % 50000 == 0:
+            pbar.update(50000)
+            pbar.set_postfix({"num_docs": num_docs})
+            # print(f"i: {i} num_docs: {num_docs}")
+
+        # print docs
+        # if array[beg_of_doc+i] == eot_id:
+        #     end_of_doc = beg_of_doc + i
+        #     print(f"doc between {beg_of_doc} and {end_of_doc} inclusively")
+        #     print("doc is")
+        #     print(tokenizer.decode(array[beg_of_doc:end_of_doc+1]))
+        #     print("-"*80)
+        #     m = input("find the next one ? y/n").strip().lower()
+        #     if m == "n":
+        #         break
+        #     beg_of_doc = end_of_doc + 1
+        #     end_of_doc = None
+
+        i += 1
+print(f"num_docs: {num_docs}")
+
+# 
+# print("len")
+# print(len(array))
+# 
+# for i in range(0, len(array), 100):
+#     print(f"{i:4d} : {array[i:i+100]}")
+# # v = array[5000:5100]
+# # print(v)
+# # vstr = tokenizer.decode(v)
+# # print("vstr:", vstr[0:100])
+# # print()
+#