forked from Blealtan/RWKV-LM-LoRA
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
adding series of modifications for early exploration
- Loading branch information
1 parent
7ac25dc
commit c66625e
Showing
13 changed files
with
934 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
token|speaker|ts|endTs|punctuation|prepunctuation|case|tags|oldTs|oldEndTs|ali_comment | ||
Uh|1|0.2100|0.3000|,||UC|[]||| | ||
today's|1|1.7400|2.1000|||LC|[]||| | ||
date|1|2.1000|2.2800|||LC|[]||| | ||
is|1|2.2800|2.4300|||LC|[]||| | ||
January|1|2.4300|2.8400|||UC|[]||| | ||
tenth|1|2.8400|3.1400|.||LC|['1:ORDINAL']||| | ||
It'|1|3.1400|3.2800|||UC|[]|||sub(<unk>) | ||
approximately|1|3.2800|3.7600|||LC|[]||| | ||
ten|1|3.7600|3.8500|||LC|['0:TIME']|||,push_last | ||
eleven|1|3.8500|4.1400|||LC|['0:TIME']|||,push_last | ||
in|1|4.1400|4.2100|||LC|[]||| | ||
the|1|4.2100|4.2700|||LC|[]||| | ||
morning|1|4.2700|4.7100|.||LC|[]||| | ||
we|1|5.0400|5.1500|||LC|['5:CONTRACTION']|||,push_last | ||
are|1|5.1500|5.2000|||LC|['5:CONTRACTION']|||,push_last | ||
here|1|5.2000|5.4500|||LC|[]||| | ||
with|1|5.4500|5.8100|||LC|[]||| | ||
Eve|1|6.1300|6.2800|||UC|[]||| | ||
Apodaca|1|6.2800|7.0100|.||UC|[]||| | ||
Uh|1|7.2900|7.5800|||UC|[]||| | ||
present|1|7.5800|7.9500|||LC|[]||| | ||
is|1|7.9500|8.1100|||LC|[]||| | ||
myself|1|8.1100|8.5600|,||LC|[]||| | ||
Quentin|1|8.9100|9.1700|||UC|[]||| | ||
Ray|1|9.2800|9.3400|,||UC|[]||| | ||
Andi|1|9.3400|9.5400|||UC|[]||| | ||
Reeb|1|9.5400|9.8600|,||UC|[]||| | ||
um|1|10.3100|10.6100|||LC|[]||| | ||
Dan|1|10.7800|11.0100|||UC|[]||| | ||
Aguilar|1|11.0100|11.5800|,||UC|[]||| | ||
Tye|1|11.8900|12.0700|||UC|[]||| | ||
Harmon|1|12.0700|12.4000|||UC|[]||| | ||
and|1|12.4000|12.7500|||LC|[]||| | ||
Norma|1|12.7500|12.9500|||UC|[]||| | ||
Delarosa|1|12.9500|13.5800|.||UC|[]||| | ||
And|1|17.1800|17.3300|||UC|[]||| | ||
we|1|17.3300|17.4000|||LC|['6:CONTRACTION']|||,push_last | ||
are|1|17.4000|17.4400|||LC|['6:CONTRACTION']|||,push_last | ||
here|1|17.4400|17.6200|||LC|[]||| | ||
for|1|17.6200|17.7800|||LC|[]||| | ||
a|1|||||LC|[]|||del | ||
pre-trial|1|||||LC|[]|||del | ||
interview|1|||.||LC|[]|||del | ||
So|1|||.||UC|[]|||del | ||
Hi|2|||||UC|[]|||del | ||
Eve|2|||.||UC|[]|||del | ||
Hi|3|||.||UC|[]|||del | ||
Thanks|2|||||UC|[]|||del | ||
for|2|||||LC|[]|||del |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
######################################################################################################## | ||
# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM | ||
######################################################################################################## | ||
|
||
import os, copy, types, gc, sys | ||
current_path = os.path.dirname(os.path.abspath(__file__)) | ||
sys.path.append(f'{current_path}/../rwkv_pip_package/src') | ||
import sys | ||
from transformers import PreTrainedTokenizerFast | ||
tokenizer = PreTrainedTokenizerFast(tokenizer_file=f'./20B_tokenizer.json') | ||
# tokenizer = PreTrainedTokenizerFast(tokenizer_file=f'./20B_tokenizer.jp.json') | ||
import tqdm | ||
|
||
import numpy as np | ||
from diskarray import DiskArray | ||
|
||
# from diskarray import DiskArray | ||
#MODEL_NAME = '/data/workspaces/jp/LLMs/rwkv-4-raven/RWKV-4-Raven-3B-v9-Eng99%-Other1%-20230411-ctx4096.pth' | ||
#model = RWKV(model=MODEL_NAME, strategy=strategy) | ||
#pipeline = PIPELINE(model, f"{current_path}/20B_tokenizer.json") | ||
#dd = pipeline.encode(i) | ||
#xxx = pipeline.decode(model_tokens[out_last:]) | ||
|
||
# to|2|17.3400|17.4000|||LC|[]||| | ||
|
||
# Count the documents stored in a tokenized corpus file by counting its
# end-of-text tokens.
#
# Usage: python <this script> <npy_file>
#   npy_file: path to the token file, opened through DiskArray as uint16.
if len(sys.argv) < 2:
    sys.exit(f"usage: {sys.argv[0]} <npy_file>")
npy_file = sys.argv[1]

# tokenizer.encode() returns a list of ids, not a single id. Extract the
# scalar explicitly so comparisons and lookups below behave as intended,
# and fail loudly if the marker is not a single token in this vocabulary.
eot_ids = tokenizer.encode("<|endoftext|>")
if len(eot_ids) != 1:
    raise ValueError(
        f"expected '<|endoftext|>' to encode to exactly one token, got {eot_ids}"
    )
eot_id = eot_ids[0]

array = DiskArray(npy_file, dtype=np.uint16)
total_tokens = len(array)

print("try 1")
unique, counts = np.unique(array, return_counts=True)
# `counts` is aligned with `unique` (the sorted distinct values), it is NOT
# indexed by token id, so look the end-of-text id up by value. The sum is 0
# when the token never occurs.
eot_count = int(counts[unique == eot_id].sum())
print(f"we have {len(unique)} unique tokens, end of text has a count of {eot_count}")

# Number of tokens compared per step; also the progress-bar granularity.
CHUNK_SIZE = 50000

num_docs = 0
# The label must go in `desc`; the first positional argument of tqdm is the
# iterable to wrap.
with tqdm.tqdm(desc="tokens processed", total=total_tokens) as pbar:
    # Just count the number of end-of-text tokens for now.
    # NOTE: an interactive per-document viewer (decode and print each span
    # between end-of-text tokens) was sketched here previously but disabled.
    for start in range(0, total_tokens, CHUNK_SIZE):
        chunk = np.asarray(array[start:start + CHUNK_SIZE])
        num_docs += int(np.count_nonzero(chunk == eot_id))
        # Advance by what was actually processed so the bar ends exactly
        # at `total_tokens` (the last chunk may be shorter).
        pbar.update(len(chunk))
        pbar.set_postfix({"num_docs": num_docs})
print(f"num_docs: {num_docs}")
|
||
# | ||
# print("len") | ||
# print(len(array)) | ||
# | ||
# for i in range(0, len(array), 100): | ||
# print(f"{i:4d} : {array[i:i+100]}") | ||
# # v = array[5000:5100] | ||
# # print(v) | ||
# # vstr = tokenizer.decode(v) | ||
# # print("vstr:", vstr[0:100]) | ||
# # print() | ||
# |
Oops, something went wrong.