Skip to content

Commit

Permalink
adding series of modifications for early exploration
Browse files Browse the repository at this point in the history
  • Loading branch information
jprobichaud committed Apr 20, 2023
1 parent 7ac25dc commit c66625e
Show file tree
Hide file tree
Showing 13 changed files with 934 additions and 2 deletions.
50 changes: 50 additions & 0 deletions RWKV-v4neo/50lines.nlp
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
token|speaker|ts|endTs|punctuation|prepunctuation|case|tags|oldTs|oldEndTs|ali_comment
Uh|1|0.2100|0.3000|,||UC|[]|||
today's|1|1.7400|2.1000|||LC|[]|||
date|1|2.1000|2.2800|||LC|[]|||
is|1|2.2800|2.4300|||LC|[]|||
January|1|2.4300|2.8400|||UC|[]|||
tenth|1|2.8400|3.1400|.||LC|['1:ORDINAL']|||
It'|1|3.1400|3.2800|||UC|[]|||sub(<unk>)
approximately|1|3.2800|3.7600|||LC|[]|||
ten|1|3.7600|3.8500|||LC|['0:TIME']|||,push_last
eleven|1|3.8500|4.1400|||LC|['0:TIME']|||,push_last
in|1|4.1400|4.2100|||LC|[]|||
the|1|4.2100|4.2700|||LC|[]|||
morning|1|4.2700|4.7100|.||LC|[]|||
we|1|5.0400|5.1500|||LC|['5:CONTRACTION']|||,push_last
are|1|5.1500|5.2000|||LC|['5:CONTRACTION']|||,push_last
here|1|5.2000|5.4500|||LC|[]|||
with|1|5.4500|5.8100|||LC|[]|||
Eve|1|6.1300|6.2800|||UC|[]|||
Apodaca|1|6.2800|7.0100|.||UC|[]|||
Uh|1|7.2900|7.5800|||UC|[]|||
present|1|7.5800|7.9500|||LC|[]|||
is|1|7.9500|8.1100|||LC|[]|||
myself|1|8.1100|8.5600|,||LC|[]|||
Quentin|1|8.9100|9.1700|||UC|[]|||
Ray|1|9.2800|9.3400|,||UC|[]|||
Andi|1|9.3400|9.5400|||UC|[]|||
Reeb|1|9.5400|9.8600|,||UC|[]|||
um|1|10.3100|10.6100|||LC|[]|||
Dan|1|10.7800|11.0100|||UC|[]|||
Aguilar|1|11.0100|11.5800|,||UC|[]|||
Tye|1|11.8900|12.0700|||UC|[]|||
Harmon|1|12.0700|12.4000|||UC|[]|||
and|1|12.4000|12.7500|||LC|[]|||
Norma|1|12.7500|12.9500|||UC|[]|||
Delarosa|1|12.9500|13.5800|.||UC|[]|||
And|1|17.1800|17.3300|||UC|[]|||
we|1|17.3300|17.4000|||LC|['6:CONTRACTION']|||,push_last
are|1|17.4000|17.4400|||LC|['6:CONTRACTION']|||,push_last
here|1|17.4400|17.6200|||LC|[]|||
for|1|17.6200|17.7800|||LC|[]|||
a|1|||||LC|[]|||del
pre-trial|1|||||LC|[]|||del
interview|1|||.||LC|[]|||del
So|1|||.||UC|[]|||del
Hi|2|||||UC|[]|||del
Eve|2|||.||UC|[]|||del
Hi|3|||.||UC|[]|||del
Thanks|2|||||UC|[]|||del
for|2|||||LC|[]|||del
80 changes: 80 additions & 0 deletions RWKV-v4neo/inspect-npy-array.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
########################################################################################################
# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM
########################################################################################################

# Standard-library imports. Only `os` and `sys` are used in the visible part of
# this script; `copy`, `types` and `gc` are kept as-is.
import os, copy, types, gc, sys
# Absolute directory of this script file.
current_path = os.path.dirname(os.path.abspath(__file__))
# Make the sibling `rwkv_pip_package/src` directory importable.
sys.path.append(f'{current_path}/../rwkv_pip_package/src')
# NOTE(review): redundant -- `sys` is already imported above.
import sys
from transformers import PreTrainedTokenizerFast
# Tokenizer is loaded at import time from a path relative to the current
# working directory (not relative to `current_path`).
tokenizer = PreTrainedTokenizerFast(tokenizer_file=f'./20B_tokenizer.json')
# tokenizer = PreTrainedTokenizerFast(tokenizer_file=f'./20B_tokenizer.jp.json')
import tqdm

import numpy as np
# Provides the DiskArray wrapper used below to open the token file.
from diskarray import DiskArray

# from diskarray import DiskArray
#MODEL_NAME = '/data/workspaces/jp/LLMs/rwkv-4-raven/RWKV-4-Raven-3B-v9-Eng99%-Other1%-20230411-ctx4096.pth'
#model = RWKV(model=MODEL_NAME, strategy=strategy)
#pipeline = PIPELINE(model, f"{current_path}/20B_tokenizer.json")
#dd = pipeline.encode(i)
#xxx = pipeline.decode(model_tokens[out_last:])

# to|2|17.3400|17.4000|||LC|[]|||

# Number of tokens scanned between two progress-bar refreshes.
PROGRESS_CHUNK = 50000


def single_token_id(encoded):
    """Return the one token id contained in *encoded* as a plain ``int``.

    ``tokenizer.encode`` returns a list of ids, not a scalar.  Comparing array
    elements against that list only works by accident, so the value is
    normalized here.

    Args:
        encoded: an int, or a sequence holding exactly one int.

    Returns:
        The token id as a Python ``int``.

    Raises:
        ValueError: if *encoded* does not hold exactly one id.
    """
    ids = np.asarray(encoded).reshape(-1)
    if ids.size != 1:
        raise ValueError(f"expected exactly one token id, got {ids.tolist()}")
    return int(ids[0])


def token_count(unique, counts, token_id):
    """Look up how often *token_id* occurs, given ``np.unique`` output.

    ``counts`` is aligned with ``unique`` (the sorted distinct values), not
    with raw token ids, so ``counts[token_id]`` is wrong as soon as any smaller
    id is missing from the data.

    Args:
        unique: sorted distinct values, as returned by ``np.unique``.
        counts: occurrence counts aligned with *unique*.
        token_id: the id to look up.

    Returns:
        The occurrence count as an ``int``; 0 when *token_id* is absent.
    """
    pos = int(np.searchsorted(unique, token_id))
    if pos < len(unique) and unique[pos] == token_id:
        return int(counts[pos])
    return 0


def count_documents(tokens, eot_id, chunk_size=PROGRESS_CHUNK, on_progress=None):
    """Count the end-of-text markers (one per document) in *tokens*.

    The sequence is scanned in slices of *chunk_size* so that only one slice is
    converted to an in-memory array at a time, and so the comparison runs
    inside NumPy rather than in a per-element Python loop.

    Args:
        tokens: sequence of token ids supporting ``len()`` and slicing
            (assumes DiskArray slices behave like NumPy arrays -- TODO confirm).
        eot_id: integer id of the end-of-text token.
        chunk_size: number of tokens examined per step.
        on_progress: optional callable ``(tokens_in_chunk, docs_so_far)``
            invoked once after every chunk.

    Returns:
        The number of elements equal to *eot_id*.
    """
    num_docs = 0
    for start in range(0, len(tokens), chunk_size):
        chunk = np.asarray(tokens[start:start + chunk_size])
        num_docs += int(np.count_nonzero(chunk == eot_id))
        if on_progress is not None:
            on_progress(len(chunk), num_docs)
    return num_docs


def main():
    """Report token statistics for the token file named on the command line."""
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} <token-file.npy>")
    npy_file = sys.argv[1]

    eot_id = single_token_id(tokenizer.encode("<|endoftext|>"))
    # Keep the data on disk instead of going straight to a list of int:
    # array = DiskArray(npy_file, dtype=np.uint16).data.astype("int")
    array = DiskArray(npy_file, dtype=np.uint16)

    print("try 1")
    unique, counts = np.unique(array, return_counts=True)
    print(f"we have {len(unique)} unique tokens, end of text has a count of {token_count(unique, counts, eot_id)}")

    # `desc=` must be a keyword: the first positional parameter of tqdm is the
    # iterable, so a bare string there never shows up as the bar's label.
    with tqdm.tqdm(desc="tokens processed", total=len(array)) as pbar:

        def report(tokens_in_chunk, docs_so_far):
            # Advance by the real chunk length so the bar ends exactly at total.
            pbar.update(tokens_in_chunk)
            pbar.set_postfix({"num_docs": docs_so_far})

        # just count the number of end-of-text for now
        num_docs = count_documents(array, eot_id, on_progress=report)

    print(f"num_docs: {num_docs}")


if __name__ == "__main__":
    main()

#
# print("len")
# print(len(array))
#
# for i in range(0, len(array), 100):
# print(f"{i:4d} : {array[i:i+100]}")
# # v = array[5000:5100]
# # print(v)
# # vstr = tokenizer.decode(v)
# # print("vstr:", vstr[0:100])
# # print()
#
Loading

0 comments on commit c66625e

Please sign in to comment.