Skip to content

Commit

Permalink
Nightly fixes (#213)
Browse files (browse the repository at this point in the history)
* fixes

* add extra ENVS to use fork() method inside the container
  • Loading branch information
JegernOUTT authored and mitya52 committed Nov 17, 2023
1 parent fc9f078 commit 2280573
Show file tree
Hide file tree
Showing 19 changed files with 42 additions and 584 deletions.
13 changes: 10 additions & 3 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ RUN DEBIAN_FRONTEND="noninteractive" apt-get install -y \
expect \
mpich \
libmpich-dev \
python3 python3-pip python3-packaging \
python3 python3-pip \
&& rm -rf /var/lib/{apt,dpkg,cache,log}

RUN echo "export PATH=/usr/local/cuda/bin:\$PATH" > /etc/profile.d/50-smc.sh
Expand Down Expand Up @@ -39,16 +39,23 @@ RUN git clone https://github.com/smallcloudai/linguist.git /tmp/linguist \
&& rake build_gem
ENV PATH="${PATH}:/tmp/linguist/bin"

RUN DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt-get install -y python3-packaging

ENV INSTALL_OPTIONAL=TRUE
ENV BUILD_CUDA_EXT=1
ENV GITHUB_ACTIONS=true
ENV TORCH_CUDA_ARCH_LIST="6.0;6.1;7.0;7.5;8.0;8.6;8.9;9.0+PTX"
ENV FLASH_ATTENTION_FORCE_BUILD=TRUE
ENV MAX_JOBS=8
COPY . /tmp/app
RUN pip install /tmp/app && rm -rf /tmp/app
RUN pip install ninja
RUN pip install /tmp/app -v --no-build-isolation && rm -rf /tmp/app

ENV REFACT_PERM_DIR "/perm_storage"
ENV REFACT_TMP_DIR "/tmp"
ENV RDMAV_FORK_SAFE 0
ENV RDMAV_HUGEPAGES_SAFE 0

EXPOSE 8008

CMD ["python", "-m", "self_hosting_machinery.watchdog.docker_watchdog"]
CMD ["python", "-m", "self_hosting_machinery.watchdog.docker_watchdog"]
3 changes: 1 addition & 2 deletions code_contrast/format_2023q2/element.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
import termcolor, re
from refact_encoding import RefactEncoding
from typing import List, Dict, Tuple, Callable, Type


class Format2023q2:
def __init__(self, enc: RefactEncoding):
def __init__(self, enc):
self.enc = enc
self.element_start_seq: Dict[str, List[int]] = {}
self.element_classes: Dict[str, Type[Element]] = {}
Expand Down
3 changes: 1 addition & 2 deletions code_contrast/format_2023q2/format.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
from refact_encoding import RefactEncoding

from code_contrast.format_2023q2.element import Format2023q2
from code_contrast.format_2023q2.el_msg import MsgElement
from code_contrast.format_2023q2.el_chunk import ChunkElement


def format_2023q2_escape(enc: RefactEncoding) -> Format2023q2:
def format_2023q2_escape(enc) -> Format2023q2:
fmt = Format2023q2(enc)
fmt.element_start_seq = {
"SYSTEM": [enc.ESCAPE, *enc.encode("SYSTEM")],
Expand Down
3 changes: 1 addition & 2 deletions code_contrast/format_2023q2/packing.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
from refact_encoding import RefactEncoding, hlprint

from code_contrast.format_2023q2.element import Element, ElementPackingContext, Format2023q2
from typing import List, Optional
Expand All @@ -7,7 +6,7 @@
class Packer:
def __init__(self, fmt: Format2023q2):
self.fmt = fmt
self.enc: RefactEncoding = fmt.enc
self.enc = fmt.enc
self.r: List[int] = list()
self.m: List[int] = list()
self.plan: List[Element] = list()
Expand Down
9 changes: 6 additions & 3 deletions code_contrast/format_2023q2/test_2023q2.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
import termcolor

from refact_encoding import RefactEncoding, hlprint

from collections import defaultdict

from typing import List, Dict, Tuple, DefaultDict, Any, Optional
Expand Down Expand Up @@ -231,8 +229,13 @@ def self_test(


if __name__ == "__main__":
enc = RefactEncoding("bigcode_largemodel")
raise NotImplementedError("This code will fail cause the tokenizer doesn't "
"have extra fields we had before"
"Should migrate tokenizer more porperly")
from transformers import AutoTokenizer
enc = AutoTokenizer.from_pretrained("bigcode_largemodel")
fmt = format.format_2023q2_escape(enc)

# test_messages(fmt)
# test_expansion(fmt)
self_test(fmt, example_odm, limit_ctx_n=1024, limit_aux_n=128, for_training=True, verbose=True)
Expand Down
1 change: 1 addition & 0 deletions refact_data_pipeline/code_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from typing import Optional, List

from kshingle import shingleseqs_list

from refact_data_pipeline import DatasetOpts
from refact_data_pipeline.utils.text_extraction import get_nl_ratio

Expand Down
17 changes: 9 additions & 8 deletions refact_data_pipeline/datadef.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,11 @@


class DatasetDef:
def __init__(self,
cloud_path: str,
cloud_files: List[str],
to_apply: Set[str]
def __init__(
self,
cloud_path: str,
cloud_files: List[str],
to_apply: Set[str]
):
self.cloud_path = cloud_path
self.cloud_files = cloud_files
Expand Down Expand Up @@ -39,9 +40,10 @@ def __repr__(self):


class DatasetMix:
def __init__(self,
dataset_defs: List[DatasetDef],
proportions: List[float] = [],
def __init__(
self,
dataset_defs: List[DatasetDef],
proportions: List[float] = [],
):
self.dataset_defs = dataset_defs
self.proportions = proportions
Expand Down Expand Up @@ -83,4 +85,3 @@ def assert_all_used(self):

def __repr__(self):
return json.dumps(self.opts)

4 changes: 2 additions & 2 deletions refact_data_pipeline/datautils.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import os

import torch as th
from collections import defaultdict
from typing import Iterator, Tuple, Dict, Any, Callable, Iterable, List

import torch as th

from refact_data_pipeline import DatasetOpts


Expand Down
3 changes: 1 addition & 2 deletions refact_data_pipeline/filters_chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
from code_contrast.format_2023q2.el_msg import MsgElement
from code_contrast.format_2023q2.element import Format2023q2
from refact_data_pipeline import DatasetOpts
from refact_encoding.encoding import RefactEncoding


class Chat2023Q2:
Expand All @@ -22,7 +21,7 @@ def __init__(
self.no_format_prob = dataopts.get("chat_no_format_prob", 0.0)
self.debug = bool(dataopts.get("debug", 0))
self.tkr_stochastic_tokens = bool(dataopts.get("tkr_stochastic_tokens", 0.0))
self.enc: RefactEncoding = dataopts.encoding
self.enc = dataopts.encoding
self.fmt: Format2023q2 = format.format_2023q2_escape(self.enc)
self.random = np.random.RandomState(dataopts.get("seed", 42))

Expand Down
107 changes: 0 additions & 107 deletions refact_data_pipeline/filters_diff.py

This file was deleted.

3 changes: 1 addition & 2 deletions refact_data_pipeline/filters_diff2023q2.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@

import numpy as np

from refact_encoding import RefactEncoding
from code_contrast.format_2023q2.element import Format2023q2
from code_contrast.format_2023q2 import format, packing
from code_contrast.format_2023q2.from_orig_dest_message import from_odm_dict
Expand All @@ -24,7 +23,7 @@ def __init__(self,
self.seed = dataopts.get("seed", 42)
self.py_random = random.Random(self.seed if self.seed else None)
self.np_random = np.random.RandomState(self.seed if self.seed else None)
self.enc: RefactEncoding = dataopts.encoding
self.enc = dataopts.encoding
self.fmt: Format2023q2 = format.format_2023q2_escape(self.enc)

def __iter__(self):
Expand Down
76 changes: 0 additions & 76 deletions refact_data_pipeline/filters_diff_valid.py

This file was deleted.

3 changes: 1 addition & 2 deletions refact_data_pipeline/filters_fim.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import random

from refact_encoding import RefactEncoding
from refact_data_pipeline import DatasetOpts

from typing import Dict, Union
Expand Down Expand Up @@ -48,7 +47,7 @@ def __init__(
self.n_ctx = dataopts.get("n_ctx", 2048)
self.fim_probability = dataopts.get("fim_probability", 0.5)
self.tkr_stochastic_tokens = dataopts.get("tkr_stochastic_tokens", 3)
self.enc: RefactEncoding = dataopts.encoding
self.enc = dataopts.encoding
if hasattr(self.enc, "set_random_seed"):
self.enc.set_random_seed(dataopts.get("seed", 42))
self.special_tokens = [
Expand Down
Loading

0 comments on commit 2280573

Please sign in to comment.