Skip to content

Commit

Permalink
No public description
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 704092684
  • Loading branch information
agutkin committed Dec 9, 2024
1 parent 97e9e60 commit 1d492bb
Show file tree
Hide file tree
Showing 6 changed files with 135 additions and 77 deletions.
48 changes: 9 additions & 39 deletions protoscribe/corpus/builder/build_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,17 +36,16 @@

import logging
import os
import shutil
from typing import Any

from absl import flags
from protoscribe.utils import file_utils
from protoscribe.utils import subprocess_utils

import glob
import os

_SRC_DIR = "protoscribe"
_RESOURCE_DIR = "protoscribe"
_RESOURCE_DIR = file_utils.RESOURCE_DIR
_TEXT_GENERATOR = f"{_RESOURCE_DIR}/texts/generate"
_PHONETIC_EMBEDDINGS_BUILDER = (
f"{_RESOURCE_DIR}/language/phonology/build_phonetic_embeddings"
Expand All @@ -67,13 +66,13 @@

_ADMINISTRATIVE_CATEGORIES = flags.DEFINE_string(
"administrative_categories",
f"{_SRC_DIR}/data/concepts/administrative_categories.txt",
f"{file_utils.SRC_DIR}/data/concepts/administrative_categories.txt",
"Path to administrative categories to use."
)

_NON_ADMINISTRATIVE_CATEGORIES = flags.DEFINE_string(
"non_administrative_categories",
f"{_SRC_DIR}/data/concepts/non_administrative_categories.txt",
f"{file_utils.SRC_DIR}/data/concepts/non_administrative_categories.txt",
"Path to non-administrative categories to use."
)

Expand Down Expand Up @@ -108,7 +107,7 @@
"MORPHEME_CVCC_MONO"
],
"Morpheme shape. For available values see "
f"{_SRC_DIR}/texts/common_configs.py."
f"{file_utils.SRC_DIR}/texts/common_configs.py."
)

_NUMBER_CONFIG = flags.DEFINE_string(
Expand Down Expand Up @@ -202,35 +201,6 @@
)


def _src_file(path: str) -> str:
  """Resolves a path inside the source tree.

  Args:
    path: Path relative to `_SRC_DIR`.

  Returns:
    `path` joined onto `_SRC_DIR`. Because `os.path.join` is used, an
    absolute `path` is returned unchanged.
  """
  joined = os.path.join(_SRC_DIR, path)
  return joined


def _copy_file(src_path: str, dst_path: str) -> None:
  """Logs and performs a single file copy using `shutil.copy2`.

  Args:
    src_path: Path of the file to copy.
    dst_path: Destination path handed to `shutil.copy2`.
  """
  logging.info("Copying %s -> %s ...", src_path, dst_path)
  shutil.copy2(src=src_path, dst=dst_path)


def _copy_src_file(dir_name: str, file_name: str, language_dir: str) -> None:
  """Copies a file from the source tree into a language directory.

  Args:
    dir_name: Directory holding the file; resolved through `_src_file`.
    file_name: Name (or relative path) of the file inside `dir_name`.
    language_dir: Directory receiving the copy; `file_name` is joined onto
      it to form the destination path.
  """
  relative_path = os.path.join(dir_name, file_name)
  _copy_file(
      _src_file(relative_path),
      os.path.join(language_dir, file_name),
  )


def _copy_full_path(file_path: str, language_dir: str) -> None:
  """Copies a file into `language_dir`, keeping only its base name.

  Args:
    file_path: Path of the file to copy. It is joined onto the current
      working directory, so a relative path is resolved against it and an
      absolute path is used as is.
    language_dir: Directory receiving the copy.
  """
  full_file_path = os.path.join(os.getcwd(), file_path)
  # Copy directly rather than going through `_copy_src_file`: that helper
  # prepends `_SRC_DIR`, which only worked here because `os.path.join`
  # discards earlier components when handed an absolute path.
  _copy_file(
      full_file_path,
      os.path.join(language_dir, os.path.basename(full_file_path)),
  )


def _output_file(filename: str) -> str:
  """Returns `filename` joined onto the directory held by `_OUTPUT_DIR`."""
  output_dir = _OUTPUT_DIR.value
  return os.path.join(output_dir, filename)

Expand Down Expand Up @@ -271,16 +241,16 @@ def _prepare_language_components() -> None:
os.makedirs(language_dir, exist_ok=True)

# Copy these files that are copyable verbatim.
_copy_src_file("texts/configs", _NUMBER_CONFIG.value, language_dir)
_copy_full_path(_ADMINISTRATIVE_CATEGORIES.value, output_dir)
_copy_full_path(_NON_ADMINISTRATIVE_CATEGORIES.value, output_dir)
file_utils.copy_src_file("texts/configs", _NUMBER_CONFIG.value, language_dir)
file_utils.copy_full_path(_ADMINISTRATIVE_CATEGORIES.value, output_dir)
file_utils.copy_full_path(_NON_ADMINISTRATIVE_CATEGORIES.value, output_dir)
all_concept_paths = [
_output_file("administrative_categories.txt"),
_output_file("non_administrative_categories.txt"),
]
excluded_concepts_path = _excluded_concepts_path()
if excluded_concepts_path:
_copy_file(_EXCLUDE_CONCEPTS_FILE.value, excluded_concepts_path)
file_utils.copy_file(_EXCLUDE_CONCEPTS_FILE.value, excluded_concepts_path)

# Generate the actual core language and embeddings files.
common_language_args = _common_language_args()
Expand Down
6 changes: 3 additions & 3 deletions protoscribe/corpus/reader/dataset_defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,10 @@
from protoscribe.glyphs import glyph_vocab as glyph_lib
from protoscribe.sketches.utils import stroke_stats as stroke_stats_lib
from protoscribe.sketches.utils import stroke_tokenizer as tokenizer_lib
from protoscribe.utils import file_utils

import glob
import os
# Internal resources dependency

StrokeStats = stroke_stats_lib.FinalStrokeStats
StrokeTokenizer = tokenizer_lib.StrokeTokenizer
Expand Down Expand Up @@ -109,8 +109,8 @@ def phonetic_embeddings_file() -> str:

def stroke_tokenizer_file(file_name: str) -> str:
"""Stroke quantizer file for tokenizing sketches."""
return os.path.join(
os.getcwd(), _STROKE_TOKENIZER_ROOT_DIR, file_name
return file_utils.resource_path(
os.path.join(_STROKE_TOKENIZER_ROOT_DIR, file_name)
)


Expand Down
12 changes: 3 additions & 9 deletions protoscribe/evolution/make_html_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,10 @@
import csv
import logging
import os
import shutil

from absl import app
from absl import flags
from protoscribe.utils import file_utils

import glob
import os
Expand Down Expand Up @@ -49,10 +49,6 @@
"Height of the glyph images (in pixels)."
)

_NUM_COPY_WORKERS = flags.DEFINE_integer(
"num_copy_workers", -1,
"Number of workers when performing parallel copy."
)

# HTML header.
_HTML_HEADER = """<!DOCTYPE html>
Expand Down Expand Up @@ -140,11 +136,9 @@ def _copy_svgs(concepts: set[str], source_dir: str, target_dir: str) -> None:
source_path = os.path.join(source_dir, filename)
if not os.path.exists(source_path):
raise FileNotFoundError(f"Source SVG {source_path} not found")
paths.append((source_path, os.path.join(target_dir, filename)))
paths.append(source_path)

for source_path, target_path in paths:
logging.info("Copying %s -> %s ...", source_path, target_path)
shutil.copy(source_path, target_path)
file_utils.copy_files(paths, target_dir)


def main(unused_argv):
Expand Down
15 changes: 5 additions & 10 deletions protoscribe/texts/generate_simple_corpus_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,16 +25,14 @@

from collections.abc import Sequence
import logging
import os
import shutil

from absl import app
from absl import flags
from protoscribe.utils import file_utils
from protoscribe.utils import subprocess_utils

import glob
import os
# Internal resources dependency

_DATASET_DIR = flags.DEFINE_string(
"dataset_dir", None,
Expand Down Expand Up @@ -72,8 +70,8 @@
"Probability of generating a supercategory glyph if one is available."
)

_SRC_DIR = "protoscribe"
_RESOURCE_DIR = "protoscribe"
_SRC_DIR = file_utils.SRC_DIR
_RESOURCE_DIR = file_utils.RESOURCE_DIR
_TEXT_GENERATOR = f"{_RESOURCE_DIR}/texts/generate"
_VOCAB_BUILDER = f"{_RESOURCE_DIR}/texts/make_vocab_files"
_PHONETIC_EMBEDDINGS_BUILDER = (
Expand Down Expand Up @@ -166,11 +164,8 @@ def main(argv: Sequence[str]) -> None:
logging.info("Copying semantic embeddings ...")
sem_dir = f"{_RESOURCE_DIR}/data/semantics/bnc"
for filename in ["embeddings.txt", "numbers.txt"]:
src_file = os.path.join(
os.getcwd(), f"{sem_dir}/{filename}"
)
dst_file = f"{params_dir}/{filename}"
shutil.copy2(src_file, dst_file)
src_file = file_utils.resource_path(f"{sem_dir}/{filename}")
file_utils.copy_full_path(src_file, params_dir)

logging.info("Initial corpus generated in %s.", initial_dir)

Expand Down
109 changes: 109 additions & 0 deletions protoscribe/utils/file_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
# Copyright 2024 The Protoscribe Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Miscellaneous file-related utilities."""

import logging
import os
import shutil

from absl import flags

import glob
import os
# Internal resources dependency

# NOTE(review): no function visible in this module reads this flag, and
# `copy_files` copies sequentially -- confirm whether a parallel copy path
# exists elsewhere or is still to be added.
_NUM_COPY_WORKERS = flags.DEFINE_integer(
"num_copy_workers", -1,
"Number of workers when performing parallel copy."
)

# Source directory for all the code and data.
SRC_DIR = "protoscribe"
# Root used when building resource paths (callers prefix tool and data
# locations with it). Currently the same directory as `SRC_DIR`.
RESOURCE_DIR = "protoscribe"


def resource_path(path: str) -> str:
  """Anchors a resource path at the current working directory.

  Args:
    path: Path to the resource.

  Returns:
    `path` joined onto `os.getcwd()`. Because `os.path.join` is used, an
    absolute `path` is returned unchanged.
  """
  working_dir = os.getcwd()
  return os.path.join(working_dir, path)


def src_file(path: str) -> str:
  """Resolves a path inside the source tree.

  Args:
    path: Path relative to `SRC_DIR`.

  Returns:
    `path` joined onto `SRC_DIR`. Because `os.path.join` is used, an
    absolute `path` is returned unchanged.
  """
  joined = os.path.join(SRC_DIR, path)
  return joined


def copy_file(src_path: str, dst_path: str) -> None:
  """Logs and performs a single file copy using `shutil.copy2`.

  Args:
    src_path: Fully-qualified source path.
    dst_path: Fully-qualified destination path.
  """
  logging.info("Copying %s -> %s ...", src_path, dst_path)
  shutil.copy2(src=src_path, dst=dst_path)


def copy_src_file(source_dir: str, file_name: str, output_dir: str) -> None:
  """Copies a file from the source tree into a target directory.

  Args:
    source_dir: Directory holding the file; resolved through `src_file`.
    file_name: File name or path in `source_dir` to copy.
    output_dir: Target directory; `file_name` is joined onto it to form the
      destination path.
  """
  relative_path = os.path.join(source_dir, file_name)
  copy_file(
      src_file(relative_path),
      os.path.join(output_dir, file_name),
  )


def copy_full_path(file_path: str, output_dir: str) -> None:
  """Copies a file into `output_dir`, keeping only its base name.

  Args:
    file_path: Path of the file to copy. It is joined onto the current
      working directory, so a relative path is resolved against it and an
      absolute path is used as is.
    output_dir: Output directory.
  """
  full_file_path = os.path.join(os.getcwd(), file_path)
  # Copy directly rather than going through `copy_src_file`: that helper
  # prepends `SRC_DIR`, which only worked here because `os.path.join`
  # discards earlier components when handed an absolute path.
  copy_file(
      full_file_path,
      os.path.join(output_dir, os.path.basename(full_file_path)),
  )


def copy_files(paths: list[str], target_dir: str) -> None:
  """Copies each listed file into a target directory.

  Every file keeps its base name; copying is done one file at a time with
  `shutil.copy`.

  Args:
    paths: List of file paths.
    target_dir: Target directory for copying.
  """
  logging.info("Copying %d files to %s ...", len(paths), target_dir)
  # All destinations are computed before the first copy starts.
  destinations = [
      os.path.join(target_dir, os.path.basename(source)) for source in paths
  ]
  for source, destination in zip(paths, destinations):
    logging.info("Copying %s -> %s ...", source, destination)
    shutil.copy(source, destination)
22 changes: 6 additions & 16 deletions protoscribe/utils/subprocess_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,24 +15,12 @@
"""Utilities for executing subprocesses."""

import logging
import os
import subprocess
import sys
from typing import Any
import os
import sys

# Internal resources dependency


def _get_resource_path(path: str) -> str:
"""Returns fully qualified path for the given resource.
Args:
path: Path to resource.
Returns:
Full path.
"""
return os.path.join(os.getcwd(), path)
from protoscribe.utils import file_utils


def _val_to_string(value: Any) -> str:
Expand Down Expand Up @@ -63,7 +51,9 @@ def run_subprocess(exec_path: str, args: list[Any]) -> None:
)

# Determine the process to execute.
exec_args = [sys.executable, _get_resource_path(f"{exec_path}_main.py")]
exec_args = [
sys.executable, file_utils.resource_path(f"{exec_path}_main.py")
]

# Makes sure that all elements of the process' argument list are
# in `name=value` format.
Expand Down

0 comments on commit 1d492bb

Please sign in to comment.