From 0f444f34e9b0deff6e3399a094dddc207053106c Mon Sep 17 00:00:00 2001
From: Alexander Gutkin
Date: Sun, 8 Dec 2024 12:47:17 +0000
Subject: [PATCH] Simple textual corpus generation tool.

PiperOrigin-RevId: 703986758
---
 .../texts/generate_simple_corpus_main.py   | 179 ++++++++++++++++++
 protoscribe/texts/make_vocab_files_main.py |   4 +-
 2 files changed, 181 insertions(+), 2 deletions(-)
 create mode 100644 protoscribe/texts/generate_simple_corpus_main.py

diff --git a/protoscribe/texts/generate_simple_corpus_main.py b/protoscribe/texts/generate_simple_corpus_main.py
new file mode 100644
index 0000000..10b9036
--- /dev/null
+++ b/protoscribe/texts/generate_simple_corpus_main.py
@@ -0,0 +1,179 @@
+# Copyright 2024 The Protoscribe Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+r"""Helper tool for generating simple (textual-only) data for model training.
+
+Example:
+--------
+Generates the initial data using defaults:
+
+python protoscribe/texts/generate_simple_corpus_main.py \
+  --dataset_dir /tmp/protoscribe \
+  --logtostderr
+"""
+
+from collections.abc import Sequence
+import logging
+import os
+import shutil
+
+from absl import app
+from absl import flags
+from protoscribe.utils import subprocess_utils
+
+# Internal resources dependency
+
+_DATASET_DIR = flags.DEFINE_string(
+    "dataset_dir", None,
+    "Parent directory for the dataset.",
+    required=True
+)
+
+_MAX_HOMOPHONY = flags.DEFINE_integer(
+    "max_homophony", 5,
+    "Maximum amount of homophony."
+)
+
+_NUMBER_CONFIG = flags.DEFINE_string(
+    "number_config", "number_config_sg_du_pl.textproto",
+    "Number generation configuration."
+)
+
+_NUM_SETS = flags.DEFINE_integer(
+    "num_sets", 5,
+    "Number of sets of accounting texts to generate."
+)
+
+_NUM_TEXTS = flags.DEFINE_integer(
+    "num_texts", 10_000,
+    "Number of accounting documents to generate."
+)
+
+_MAX_COMMODITY = flags.DEFINE_integer(
+    "max_commodity", 99,
+    "Maximum cardinal representing the number of commodities."
+)
+
+_PROBABILITY_OF_SUPERCATEGORY_GLYPH = flags.DEFINE_float(
+    "probability_of_supercategory_glyph", 0.25,
+    "Probability of generating a supercategory glyph if one is available."
+)
+
+_SRC_DIR = "protoscribe"
+_RESOURCE_DIR = "protoscribe"
+_TEXT_GENERATOR = f"{_RESOURCE_DIR}/texts/generate"
+_VOCAB_BUILDER = f"{_RESOURCE_DIR}/texts/make_vocab_files"
+_PHONETIC_EMBEDDINGS_BUILDER = (
+    f"{_RESOURCE_DIR}/language/phonology/build_phonetic_embeddings"
+)
+
+
+def main(argv: Sequence[str]) -> None:
+  if len(argv) > 1:
+    raise app.UsageError("Too many command-line arguments.")
+
+  # Prepare output directories.
+  initial_dir = f"{_DATASET_DIR.value}/initial_texts"
+  params_dir = f"{initial_dir}/params"
+  concepts_dir = f"{_SRC_DIR}/data/concepts"
+  os.makedirs(params_dir, exist_ok=True)
+
+  # Generate lexicon resources.
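+  # The text generator is invoked once in --generate_lexical_resources mode;
+  # this writes the main, number and affix lexicons, the morphology
+  # parameters and the phonological rule FST archives that every
+  # text-generation run below reads back in.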
+ logging.info("Generating the lexicon with ALL concepts in %s ...", params_dir) + concept_files = [ + f"{concepts_dir}/administrative_categories.txt", + f"{concepts_dir}/non_administrative_categories.txt", + ] + number_config_file = f"{_SRC_DIR}/texts/configs/{_NUMBER_CONFIG.value}" + subprocess_utils.run_subprocess( + _TEXT_GENERATOR, + args=[ + "--generate_lexical_resources", "true", + "--concepts", ",".join(concept_files), + "--affix_lexicon", f"{params_dir}/affixes.tsv", + "--main_lexicon", f"{params_dir}/lexicon.tsv", + "--morphology_params", f"{params_dir}/morphology_params.textproto", + "--number_lexicon", f"{params_dir}/number_lexicon.tsv", + "--number_phon_rules", f"{params_dir}/number_phon_rules.far", + "--phon_rules", f"{params_dir}/phon_rules.far", + "--number_config_file", number_config_file, + "--max_homophony", _MAX_HOMOPHONY.value, + ] + ) + + # Now generate the accounting texts. + output_dir = f"{initial_dir}/output" + if not os.path.exists(output_dir): + os.makedirs(output_dir, exist_ok=True) + for set_idx in range(_NUM_SETS.value): + logging.info("Generating accounting texts set %d ...", set_idx) + subprocess_utils.run_subprocess( + _TEXT_GENERATOR, + args=[ + "--concepts", ",".join(concept_files), + "--affix_lexicon", f"{params_dir}/affixes.tsv", + "--main_lexicon", f"{params_dir}/lexicon.tsv", + "--morphology_params", f"{params_dir}/morphology_params.textproto", + "--number_lexicon", f"{params_dir}/number_lexicon.tsv", + "--number_phon_rules", f"{params_dir}/number_phon_rules.far", + "--phon_rules", f"{params_dir}/phon_rules.far", + "--number_config_file", number_config_file, + "--num_texts", _NUM_TEXTS.value, + "--probability_of_supercategory_glyph", + _PROBABILITY_OF_SUPERCATEGORY_GLYPH.value, + "--max_commodity", _MAX_COMMODITY.value, + "--output_texts", f"{output_dir}/accounts_{set_idx}.txt", + ] + ) + + # Create the vocabulary files. + logging.info("Making the vocabulary files ...") + subprocess_utils.run_subprocess( + _VOCAB_BUILDER, + args=[ + "--texts_glob", f"{output_dir}/accounts_[0-{_NUM_SETS.value}].txt", + "--glyph_syms", f"{params_dir}/glyphs.syms", + "--word_syms", f"{params_dir}/words.syms", + ] + ) + + # Build phonetic embeddings. + logging.info("Building phonetic embeddings ...") + subprocess_utils.run_subprocess( + _PHONETIC_EMBEDDINGS_BUILDER, + args=[ + "--main_lexicon", f"{params_dir}/lexicon.tsv", + "--number_lexicon", f"{params_dir}/number_lexicon.tsv", + "--embeddings", f"{params_dir}/phonetic_embeddings.tsv", + ] + ) + + # Copy semantic embeddings. 
+ logging.info("Copying semantic embeddings ...") + sem_dir = f"{_RESOURCE_DIR}/data/semantics/bnc" + for filename in ["embeddings.txt", "numbers.txt"]: + src_file = os.path.join( + os.getcwd(), f"{sem_dir}/{filename}" + ) + dst_file = f"{params_dir}/{filename}" + shutil.copy2(src_file, dst_file) + + logging.info("Initial corpus generated in %s.", initial_dir) + + +if __name__ == "__main__": + app.run(main) diff --git a/protoscribe/texts/make_vocab_files_main.py b/protoscribe/texts/make_vocab_files_main.py index cdeacf0..057f8d6 100644 --- a/protoscribe/texts/make_vocab_files_main.py +++ b/protoscribe/texts/make_vocab_files_main.py @@ -66,11 +66,11 @@ def load_administrative_concepts() -> set[str]: def main(unused_argv): administrative_concepts = load_administrative_concepts() - glob = glob.glob(_TEXTS_GLOB.value) + paths = glob.glob(_TEXTS_GLOB.value) administrative_glyphs = set() non_administrative_glyphs = set() word_vocab = set() - for path in glob: + for path in paths: with open(path) as stream: reader = csv.reader(stream, delimiter="\t", quotechar='"') for row in reader: