Simple textual corpus generation tool.
PiperOrigin-RevId: 703986758
agutkin committed Dec 8, 2024
1 parent ea6af4f commit 0f444f3
Showing 2 changed files with 181 additions and 2 deletions.
protoscribe/texts/generate_simple_corpus_main.py (179 additions, 0 deletions)
@@ -0,0 +1,179 @@
# Copyright 2024 The Protoscribe Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

r"""Helper tool for generating simple (textual-only) data for model training.
Example:
--------
Generates the initial data using defaults.
python protoscribe/texts/generate_simple_corpus_main.py \
--dataset_dir /tmp/protoscribe \
--logtostderr
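
The defaults can be overridden via the flags defined below, e.g. to generate
a smaller corpus:

  python protoscribe/texts/generate_simple_corpus_main.py \
    --dataset_dir /tmp/protoscribe \
    --num_sets 2 \
    --num_texts 1000 \
    --logtostderr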
"""

from collections.abc import Sequence
import logging
import os
import shutil

from absl import app
from absl import flags
from protoscribe.utils import subprocess_utils

# Internal resources dependency

_DATASET_DIR = flags.DEFINE_string(
"dataset_dir", None,
"Parent directory for the dataset.",
required=True
)

_MAX_HOMOPHONY = flags.DEFINE_integer(
"max_homophony", 5,
"Maximum amount of homophony."
)

_NUMBER_CONFIG = flags.DEFINE_string(
"number_config", "number_config_sg_du_pl.textproto",
"Number generation configuration."
)

_NUM_SETS = flags.DEFINE_integer(
"num_sets", 5,
"Number of sets. of accounting texts to generate."
)

_NUM_TEXTS = flags.DEFINE_integer(
"num_texts", 10_000,
"Number of accounting documents to generate."
)

_MAX_COMMODITY = flags.DEFINE_integer(
"max_commodity", 99,
"Maximum cardinal representing the number of commodities."
)

_PROBABILITY_OF_SUPERCATEGORY_GLYPH = flags.DEFINE_float(
"probability_of_supercategory_glyph", 0.25,
"Probability of generating a supercategory glyph if one is available."
)

_SRC_DIR = "protoscribe"
_RESOURCE_DIR = "protoscribe"
_TEXT_GENERATOR = f"{_RESOURCE_DIR}/texts/generate"
_VOCAB_BUILDER = f"{_RESOURCE_DIR}/texts/make_vocab_files"
_PHONETIC_EMBEDDINGS_BUILDER = (
f"{_RESOURCE_DIR}/language/phonology/build_phonetic_embeddings"
)
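# The three paths above refer to the companion binaries that this tool
# drives: each pipeline stage in main() runs one of them as a separate
# subprocess via subprocess_utils.run_subprocess.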


def main(argv: Sequence[str]) -> None:
if len(argv) > 1:
raise app.UsageError("Too many command-line arguments.")

# Prepare output directories.
initial_dir = f"{_DATASET_DIR.value}/initial_texts"
params_dir = f"{initial_dir}/params"
concepts_dir = f"{_SRC_DIR}/data/concepts"
  os.makedirs(params_dir, exist_ok=True)

# Generate lexicon resources.
logging.info("Generating the lexicon with ALL concepts in %s ...", params_dir)
concept_files = [
f"{concepts_dir}/administrative_categories.txt",
f"{concepts_dir}/non_administrative_categories.txt",
]
number_config_file = f"{_SRC_DIR}/texts/configs/{_NUMBER_CONFIG.value}"
subprocess_utils.run_subprocess(
_TEXT_GENERATOR,
args=[
"--generate_lexical_resources", "true",
"--concepts", ",".join(concept_files),
"--affix_lexicon", f"{params_dir}/affixes.tsv",
"--main_lexicon", f"{params_dir}/lexicon.tsv",
"--morphology_params", f"{params_dir}/morphology_params.textproto",
"--number_lexicon", f"{params_dir}/number_lexicon.tsv",
"--number_phon_rules", f"{params_dir}/number_phon_rules.far",
"--phon_rules", f"{params_dir}/phon_rules.far",
"--number_config_file", number_config_file,
"--max_homophony", _MAX_HOMOPHONY.value,
]
)
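  # The lexical resources generated above (lexicons, morphology parameters
  # and phonological rule archives) are reused by every accounting-text
  # generation run below.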

# Now generate the accounting texts.
output_dir = f"{initial_dir}/output"
  os.makedirs(output_dir, exist_ok=True)
for set_idx in range(_NUM_SETS.value):
logging.info("Generating accounting texts set %d ...", set_idx)
subprocess_utils.run_subprocess(
_TEXT_GENERATOR,
args=[
"--concepts", ",".join(concept_files),
"--affix_lexicon", f"{params_dir}/affixes.tsv",
"--main_lexicon", f"{params_dir}/lexicon.tsv",
"--morphology_params", f"{params_dir}/morphology_params.textproto",
"--number_lexicon", f"{params_dir}/number_lexicon.tsv",
"--number_phon_rules", f"{params_dir}/number_phon_rules.far",
"--phon_rules", f"{params_dir}/phon_rules.far",
"--number_config_file", number_config_file,
"--num_texts", _NUM_TEXTS.value,
"--probability_of_supercategory_glyph",
_PROBABILITY_OF_SUPERCATEGORY_GLYPH.value,
"--max_commodity", _MAX_COMMODITY.value,
"--output_texts", f"{output_dir}/accounts_{set_idx}.txt",
]
)
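  # At this point the output directory contains num_sets files, named
  # accounts_0.txt through accounts_{num_sets - 1}.txt.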

# Create the vocabulary files.
logging.info("Making the vocabulary files ...")
subprocess_utils.run_subprocess(
_VOCAB_BUILDER,
args=[
"--texts_glob", f"{output_dir}/accounts_[0-{_NUM_SETS.value}].txt",
"--glyph_syms", f"{params_dir}/glyphs.syms",
"--word_syms", f"{params_dir}/words.syms",
]
)

# Build phonetic embeddings.
logging.info("Building phonetic embeddings ...")
subprocess_utils.run_subprocess(
_PHONETIC_EMBEDDINGS_BUILDER,
args=[
"--main_lexicon", f"{params_dir}/lexicon.tsv",
"--number_lexicon", f"{params_dir}/number_lexicon.tsv",
"--embeddings", f"{params_dir}/phonetic_embeddings.tsv",
]
)

# Copy semantic embeddings.
logging.info("Copying semantic embeddings ...")
sem_dir = f"{_RESOURCE_DIR}/data/semantics/bnc"
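  # shutil.copy2 also preserves file metadata (e.g. modification times),
  # not just the file contents.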
for filename in ["embeddings.txt", "numbers.txt"]:
    src_file = os.path.join(os.getcwd(), sem_dir, filename)
dst_file = f"{params_dir}/{filename}"
shutil.copy2(src_file, dst_file)

logging.info("Initial corpus generated in %s.", initial_dir)


if __name__ == "__main__":
app.run(main)
protoscribe/texts/make_vocab_files_main.py (2 additions, 2 deletions)
@@ -66,11 +66,11 @@ def load_administrative_concepts() -> set[str]:

 def main(unused_argv):
   administrative_concepts = load_administrative_concepts()
-  glob = glob.glob(_TEXTS_GLOB.value)
+  paths = glob.glob(_TEXTS_GLOB.value)
   administrative_glyphs = set()
   non_administrative_glyphs = set()
   word_vocab = set()
-  for path in glob:
+  for path in paths:
     with open(path) as stream:
       reader = csv.reader(stream, delimiter="\t", quotechar='"')
       for row in reader: