fix typos (#3036)
RainRat authored May 4, 2023
1 parent b509c2b commit 9bcc916
Showing 17 changed files with 20 additions and 20 deletions.
4 changes: 2 additions & 2 deletions backend/export.py
@@ -232,7 +232,7 @@ def export_trees(
db,
message_tree_id=tree_id,
deleted=deleted,
-synthetic=None, # pass None here (export trees, filtering happend in fetch_tree_ids)
+synthetic=None, # pass None here (export trees, filtering happened in fetch_tree_ids)
prompts_only=prompts_only,
lang=None, # pass None, trees were selected based on lang of prompt
review_result=review_result,
@@ -255,7 +255,7 @@ def export_trees(
db,
message_tree_id=tree_id,
deleted=deleted,
-synthetic=None, # pass None here (export trees, filtering happend in fetch_tree_ids)
+synthetic=None, # pass None here (export trees, filtering happened in fetch_tree_ids)
prompts_only=prompts_only,
lang=None, # pass None here, trees were selected based on lang of prompt
review_result=review_result,
2 changes: 1 addition & 1 deletion backend/oasst_backend/scheduled_tasks.py
@@ -94,7 +94,7 @@ def update_user_streak() -> None:
if lastactitvitydelta.days > 1 or user.streak_days is None:
user.streak_days = 0
user.streak_last_day_date = current_time
-# streak_last_day_date has a current timestamp in DB. Idealy should not be NULL.
+# streak_last_day_date has a current timestamp in DB. Ideally should not be NULL.
if streak_last_day_date is not None:
streak_delta = current_time - streak_last_day_date
# if user completed tasks on consecutive days then increment the streak days
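Note: a minimal sketch of the consecutive-day streak logic this comment describes, assuming the datetime fields shown in the hunk; the standalone function is hypothetical, not taken from scheduled_tasks.py.

```python
from datetime import datetime

def update_streak(streak_days: int | None, streak_last_day_date: datetime | None,
                  current_time: datetime) -> tuple[int, datetime]:
    # No prior record, or a gap of more than one day: reset the streak.
    if streak_last_day_date is None or (current_time - streak_last_day_date).days > 1:
        return 0, current_time
    # Activity on the next consecutive day: extend the streak.
    if (current_time - streak_last_day_date).days == 1:
        return (streak_days or 0) + 1, current_time
    # Same-day activity: streak and last-day date stay unchanged.
    return streak_days or 0, streak_last_day_date
```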
2 changes: 1 addition & 1 deletion backend/oasst_backend/utils/language_classification.py
@@ -60,7 +60,7 @@ def main(foldername, modelname, num_words):
save_model(clf, langmap, num_words, modelname)
model = load(modelname)
print(
-"running infernence on long tests",
+"running inference on long tests",
inference_voter(
model,
"""
2 changes: 1 addition & 1 deletion data/datasets/nsfw_selfharm_reddit/utils/reddit.py
@@ -28,7 +28,7 @@ def init_praw_reddit(client_id: str | None = None, client_secret: str | None = N
def scrap_subreddit(subreddit: str, reddit) -> pd.DataFrame | None:
"""
Scrap "hot", "top", "rising" given a subreddit and return
-dedupped DataFrame.
+deduped DataFrame.
"""
items = []
dfs = []
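Note: a hedged sketch of the dedup step the fixed docstring describes: the frames scraped from the "hot", "top", and "rising" listings are concatenated and duplicate submissions dropped. The "id" column is an assumption about the scraped schema.

```python
import pandas as pd

def dedup_listings(dfs: list[pd.DataFrame]) -> pd.DataFrame:
    # Keep the first occurrence of each submission id across all listings.
    return pd.concat(dfs, ignore_index=True).drop_duplicates(subset="id")

frames = [
    pd.DataFrame({"id": ["a", "b"], "title": ["t1", "t2"]}),
    pd.DataFrame({"id": ["b", "c"], "title": ["t2", "t3"]}),
]
print(dedup_listings(frames))  # rows a, b, c
```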
6 changes: 3 additions & 3 deletions data/datasets/safety_directory/child_help/child_help.py
@@ -322,7 +322,7 @@
"ChildHelp Sierra Leone": {
"region": "Sierra Leone",
"page": "https://childhelplineinternational.org/sierra-leone-childhelp-sierra-leone/",
-"description": "ChildHelp Sierra Leone is designed specifically to meet both immediate and long-term needs of impoverished and underprivileged needy children, their families and communities in needand living in difficult circumstances.",
+"description": "ChildHelp Sierra Leone is designed specifically to meet both immediate and long-term needs of impoverished and underprivileged needy children, their families and communities in need and living in difficult circumstances.",
"contacts": {
"Website": {"type": "website", "link": "http://www.childhelpsl.org/"},
"+232 78 666269": {"type": "phone", "link": "tel:+23278666269"},
@@ -598,7 +598,7 @@
"Ayudo pa mucha i hoben 918": {
"region": "Curaçao",
"page": "https://childhelplineinternational.org/curacao-ayudo-pa-mucha-i-hoben-918/",
-"description": "Ayudo pa mucha i hoben 918 is free to contact via phone, chat, and forum. It is avaialble for all children, young people and young adults on Curaçao up to and including 25 years old.",
+"description": "Ayudo pa mucha i hoben 918 is free to contact via phone, chat, and forum. It is available for all children, young people and young adults on Curaçao up to and including 25 years old.",
"contacts": {
"Website": {"type": "website", "link": "https://www.918.cw"},
"918": {"type": "phone", "link": "tel:918"},
@@ -1223,7 +1223,7 @@
"National Hotline for Child Protection 111": {
"region": "Vietnam",
"page": "https://childhelplineinternational.org/vietnam-national-hotline-for-child-protection-111/",
-"description": "The Vietnamese National Hotline for Child Protection is run by the Department of Child Affairs, part of the Ministry of Labour, Invalids and Social Affairs. While the hotline deals primarily with information, reports and denuncations on risks and acts of child abuse, it also provides counselling to children.",
+"description": "The Vietnamese National Hotline for Child Protection is run by the Department of Child Affairs, part of the Ministry of Labour, Invalids and Social Affairs. While the hotline deals primarily with information, reports and denunciations on risks and acts of child abuse, it also provides counselling to children.",
"contacts": {
"Website": {"type": "website", "link": "http://tongdai111.vn/"},
"111": {"type": "phone", "link": "tel:111"},
2 changes: 1 addition & 1 deletion model/model_eval/README.md
@@ -60,4 +60,4 @@ python model/model_eval/rejection_sampling.py --data_path model/model_eval/manua
}
```

-- additionally, selected and rejected samples will be saved to seperate files
+- additionally, selected and rejected samples will be saved to separate files
2 changes: 1 addition & 1 deletion model/model_eval/eval_rm.py
@@ -49,7 +49,7 @@ def batch_inference(inputs, model):
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="")
parser.add_argument("--dataset", type=str, help="name of evaluation dataset")
-parser.add_argument("--split", type=str, help="dataset splits seperated by comma", default="train")
+parser.add_argument("--split", type=str, help="dataset splits separated by comma", default="train")
parser.add_argument("--model", type=str, help="Path or url of the model file")
parser.add_argument("--metrics", type=str, help="metrics to evaluate", default="accuracy")
parser.add_argument("--batch_size", type=int, help="Batch Size", default=8)
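Note: the corrected help string describes a comma-separated list of splits; a minimal sketch of how such a flag is typically consumed (the downstream variable names are illustrative, not from eval_rm.py):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--split", type=str, help="dataset splits separated by comma", default="train")
args = parser.parse_args(["--split", "train,validation"])

# Normalize the raw string into a clean list of split names.
splits = [s.strip() for s in args.split.split(",") if s.strip()]
assert splits == ["train", "validation"]
```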
2 changes: 1 addition & 1 deletion model/model_training/check_dataset_counts.py
@@ -89,7 +89,7 @@ def argument_parsing(notebook=False, notebook_args=None):
raise ValueError(
f'Error: Could not find the dataset "{name}" in {mode.config_name()}. ',
f"Tried to look for this dataset within th key {mode.default_config()} ",
-"and as seperate key.",
+"and as separate key.",
)

datasets_list.extend(datasets_value)
2 changes: 1 addition & 1 deletion model/model_training/custom_datasets/formatting.py
@@ -135,7 +135,7 @@ def get_formatted(self, mode: Mode, eos_token: str, **kwargs) -> str | list[str]

@classmethod
def create_from_prompter_assistant_interplay(cls, qa: dict[str, str]):
-"""Creates a DatasetEntry from a qa of given structure. Even if qa contains consecutative assistant or prompter phrases.
+"""Creates a DatasetEntry from a qa of given structure. Even if qa contains consecutive assistant or prompter phrases.
Returns:
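Note: a hedged sketch of what handling consecutive same-role phrases can look like: adjacent messages from one speaker are merged so the result alternates prompter/assistant turns. The function and its data shape are illustrative, not the DatasetEntry implementation.

```python
def merge_consecutive(turns: list[tuple[str, str]]) -> list[tuple[str, str]]:
    merged: list[tuple[str, str]] = []
    for role, text in turns:
        if merged and merged[-1][0] == role:
            # Same speaker twice in a row: fold the text into the previous turn.
            merged[-1] = (role, merged[-1][1] + "\n" + text)
        else:
            merged.append((role, text))
    return merged

turns = [("prompter", "Hi"), ("prompter", "Are you there?"), ("assistant", "Yes.")]
print(merge_consecutive(turns))  # [('prompter', 'Hi\nAre you there?'), ('assistant', 'Yes.')]
```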
2 changes: 1 addition & 1 deletion model/model_training/custom_datasets/oasst_dataset.py
@@ -54,7 +54,7 @@ def load_oasst_export(
if tree.tree_state not in ("ready_for_export", "prompt_lottery_waiting"):
continue

-# extract all threads up to last asssitant reply
+# extract all threads up to last assistant reply
threads: list[list[ExportMessageNode]] = []

def thread_filter(thread: list[ExportMessageNode]) -> bool:
2 changes: 1 addition & 1 deletion model/model_training/models/gptj.py
@@ -74,7 +74,7 @@ def __init__(self, weight, absmax, code):

def forward(self, input, **kwargs):
with torch.no_grad():
-# note: both quantuized weights and input indices are *not* differentiable
+# note: both quantized weights and input indices are *not* differentiable
weight_deq = dequantize_blockwise(self.weight, absmax=self.absmax, code=self.code)
output = F.embedding(input, weight_deq, **kwargs)
if self.adapter:
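Note: a hedged sketch of the quantize/dequantize round trip behind the fixed comment. bitsandbytes return and argument conventions vary across releases, so this mirrors the absmax/code usage only loosely and is illustrative rather than the exact call pattern in gptj.py.

```python
import torch
from bitsandbytes.functional import dequantize_blockwise, quantize_blockwise

weight = torch.randn(1024, 256)
quantized, quant_state = quantize_blockwise(weight)   # 8-bit blockwise codes + state
restored = dequantize_blockwise(quantized, quant_state)

# The reconstruction is close but lossy, and neither step is differentiable,
# which is why the embedding forward runs under torch.no_grad().
print((restored - weight).abs().max())
```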
2 changes: 1 addition & 1 deletion model/model_training/tools/augment_oasst.py
@@ -9,7 +9,7 @@
import json
import os

-# so far load_oasst_export is pretty determinstic in thread order
+# so far load_oasst_export is pretty deterministic in thread order
# means the train, val split stay the same
from model_training.custom_datasets.oasst_dataset import load_oasst_export
from model_training.models.reward_model import GPTNeoXRewardModel
2 changes: 1 addition & 1 deletion model/model_training/trainer_rl.py
@@ -88,7 +88,7 @@ def reward_fn(samples, prompts, outputs):
for i in range(math.ceil(len(samples) / mbs)):
batch_ixs = slice(i * mbs, (i + 1) * mbs)

-# We specififed int32 as types for a triton client
+# We specified int32 as types for a triton client
result = client.infer(
triton_model,
[
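Note: a hedged sketch of why the int32 comment matters: Triton rejects inputs whose numpy dtype disagrees with the declared datatype, so token ids are cast explicitly before the call. The server URL, model, and tensor names here are illustrative, not taken from trainer_rl.py.

```python
import numpy as np
import tritonclient.http as triton

client = triton.InferenceServerClient(url="localhost:8000")
token_ids = np.array([[101, 2023, 102]], dtype=np.int32)  # cast to match "INT32"

inp = triton.InferInput("input_ids", list(token_ids.shape), "INT32")
inp.set_data_from_numpy(token_ids)
result = client.infer("reward_model", [inp])
```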
2 changes: 1 addition & 1 deletion model/model_training/trainer_rm.py
@@ -163,7 +163,7 @@ def argument_parsing(notebook=False, notebook_args=None):
conf["rng_seed"] = args.rng_seed
conf["show_dataset_stats"] = args.show_dataset_stats

-# get the world size in deeepspeed
+# get the world size in deepspeed
if conf["deepspeed"]:
conf["world_size"] = int(os.getenv("WORLD_SIZE", default="1"))
else:
2 changes: 1 addition & 1 deletion model/model_training/trainer_sft.py
@@ -215,7 +215,7 @@ def argument_parsing(notebook=False, notebook_args=None):
conf["rng_seed"] = args.rng_seed
conf["show_dataset_stats"] = args.show_dataset_stats

-# get the world size in deeepspeed
+# get the world size in deepspeed
if conf["deepspeed"]:
conf["world_size"] = int(os.getenv("WORLD_SIZE", default="1"))
else:
2 changes: 1 addition & 1 deletion model/model_training/utils/ppo_utils.py
@@ -553,7 +553,7 @@ def ref_model(all_tokens, attention_masks):
for i in range(math.ceil(len(all_tokens) / mbs)):
batch_ixs = slice(i * mbs, (i + 1) * mbs)

-# We specififed int32 as types for a triton client
+# We specified int32 as types for a triton client
result = client.infer(
triton_model,
[
2 changes: 1 addition & 1 deletion scripts/postprocessing/scoring.py
@@ -101,7 +101,7 @@ def score_update_prompts(consensus: npt.ArrayLike, voter_data: Voter) -> Voter:
# produces the ranking of votes, e.g. for [100,300,200] it returns [0, 2, 1],
# since 100 is the lowest, 300 the highest and 200 the middle value
consensus_ranking = np.arange(len(consensus)) - len(consensus) // 2 + 1
-# expected consenus ranking (i.e. normalize the votes and multiply-sum with weightings)
+# expected consensus ranking (i.e. normalize the votes and multiply-sum with weightings)
delta_votes = np.sum(consensus_ranking * consensus / sum(consensus))
new_points = delta_votes + voter_data.prompt_points

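Note: a worked instance of the ranking and expected-consensus lines shown above, using the vote counts mentioned in the comment; delta_votes is the normalized, rank-weighted vote mass.

```python
import numpy as np

consensus = np.array([100, 300, 200])
consensus_ranking = np.arange(len(consensus)) - len(consensus) // 2 + 1  # [0, 1, 2]
delta_votes = np.sum(consensus_ranking * consensus / sum(consensus))
print(delta_votes)  # (0*100 + 1*300 + 2*200) / 600 ≈ 1.17
```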
