From d8c7f2db31c60ec3e670e9a5f48a19ca2bd1b8db Mon Sep 17 00:00:00 2001
From: Nicolay Rusnachenko
Date: Mon, 4 Nov 2024 21:38:07 +0000
Subject: [PATCH] Refactoring #10, #11, #12, and everything related to #13
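
This refactoring delegates CSV/JSONL reading to the external `source-iter`
package and moves `DataService` into `bulk_ner/src/service_prompt.py`.
A minimal usage sketch of the relocated prompt service (not part of the
diff below; the lambda is a stub standing in for
`bulk_ner.src.utils.iter_params`, which extracts field names such as
`text` from the prompt template):

    # Hypothetical standalone usage, assuming bulk-ner 0.24.1 is installed.
    from bulk_ner.src.service_prompt import DataService

    rows = [{"text": "It was in July, 1805"},
            {"text": "She was suffering from la grippe."}]

    # iter_prompt fills each row's fields into the prompt template
    # and yields (row_id, formatted_prompt) pairs.
    for row_id, prompt in DataService.iter_prompt(
            data_dict_it=iter(rows),
            prompt="{text}",
            parse_fields_func=lambda p: ["text"]):  # stub for iter_params
        print(row_id, prompt)

The patch also introduces a `setup_custom_logger` helper. A short sketch
of its behaviour (also not part of the diff; note that without
`add_screen_handler=True` or `filepath=...` no handlers are attached):

    from bulk_ner.src.utils import setup_custom_logger

    # INFO-level messages go to stdout once the screen handler is attached.
    logger = setup_custom_logger("bulk-ner", add_screen_handler=True)
    logger.info("Saved: test-annotated.jsonl")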
---
 README.md                      |  7 ++--
 bulk_ner/annotate.py           | 29 +++++++++-----
 bulk_ner/src/service.py        | 71 ----------------------------------
 bulk_ner/src/service_prompt.py | 14 +++++++
 bulk_ner/src/utils.py          | 23 +++++++++++
 dependencies.txt               |  1 +
 setup.py                       |  2 +-
 test/data/test.tsv             |  7 ++++
 8 files changed, 70 insertions(+), 84 deletions(-)
 delete mode 100644 bulk_ner/src/service.py
 create mode 100644 bulk_ner/src/service_prompt.py
 create mode 100644 test/data/test.tsv

diff --git a/README.md b/README.md
index 90a208f..6633852 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# bulk-ner
+# bulk-ner 0.24.1
 ![](https://img.shields.io/badge/Python-3.9-brightgreen.svg)
 ![](https://img.shields.io/badge/AREkit-0.25.0-orange.svg)
 [![](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nicolay-r/ner-service/blob/main/NER_annotation_service.ipynb)
@@ -18,7 +18,7 @@ The key benefits of this tiny framework are as follows:
 # Installation
 
 ```bash
-pip install bulk-ner==0.24.0
+pip install bulk-ner==0.24.1
 ```
 
 # Usage
@@ -27,10 +27,11 @@ This is an example for using `DeepPavlov==1.3.0` as an adapter for NER models pa
 
 ```bash
 python -m bulk_ner.annotate \
-    --src "test/data/test.csv" \
+    --src "test/data/test.tsv" \
     --prompt "{text}" \
     --batch-size 10 \
     --adapter "dynamic:models/dp_130.py:DeepPavlovNER" \
+    --output "test-annotated.jsonl" %% \
     --model "ner_ontonotes_bert_mult"
 ```
 
diff --git a/bulk_ner/annotate.py b/bulk_ner/annotate.py
index 1432377..d3d2cf4 100644
--- a/bulk_ner/annotate.py
+++ b/bulk_ner/annotate.py
@@ -1,8 +1,12 @@
 import argparse
 import os
 import sys
+
 from tqdm import tqdm
 
+from source_iter.service_csv import CsvService
+from source_iter.service_jsonl import JsonlService
+
 from arekit.common.pipeline.batching import BatchingPipelineLauncher
 from arekit.common.pipeline.context import PipelineContext
 from arekit.common.pipeline.utils import BatchIterator
@@ -10,10 +14,10 @@
 from bulk_ner.src.entity import IndexedEntity
 from bulk_ner.src.pipeline.entity_list import HandleListPipelineItem
 from bulk_ner.src.pipeline.ner import NERPipelineItem
-from bulk_ner.src.service import JsonlService, DataService, CsvService
 from bulk_ner.src.service_args import CmdArgsService
 from bulk_ner.src.service_dynamic import dynamic_init
-from bulk_ner.src.utils import IdAssigner, iter_params, parse_filepath, test_ner_demo
+from bulk_ner.src.service_prompt import DataService
+from bulk_ner.src.utils import IdAssigner, iter_params, parse_filepath, test_ner_demo, setup_custom_logger
 
 
 def iter_annotated_data(texts_it, batch_size):
@@ -38,13 +42,13 @@ def iter_annotated_data(texts_it, batch_size):
 
 
 if __name__ == '__main__':
+
+    logger = setup_custom_logger("bulk-ner")
 
     parser = argparse.ArgumentParser(description="Apply NER annotation")
 
     parser.add_argument('--adapter', dest='adapter', type=str, default=None)
     parser.add_argument('--del-meta', dest="del_meta", type=list, default=["parent_ctx"])
-    parser.add_argument('--csv-sep', dest='csv_sep', type=str, default='\t')
-    parser.add_argument('--csv-escape-char', dest='csv_escape_char', type=str, default=None)
     parser.add_argument('--prompt', dest='prompt', type=str, default="{text}")
     parser.add_argument('--src', dest='src', type=str, default=None)
     parser.add_argument('--output', dest='output', type=str, default=None)
@@ -52,6 +56,7 @@ def iter_annotated_data(texts_it, batch_size):
     parser.add_argument('--chunk-limit', dest='chunk_limit', type=int, default=128)
 
     native_args, model_args = CmdArgsService.partition_list(lst=sys.argv, sep="%%")
+    custom_args_dict = CmdArgsService.args_to_dict(model_args)
 
     args = parser.parse_args(args=native_args[1:])
 
@@ -62,20 +67,24 @@ def iter_annotated_data(texts_it, batch_size):
     input_formatters = {
         None: lambda _: test_ner_demo(
             iter_answers=lambda example: iter_annotated_data(texts_it=iter([(0, example)]), batch_size=1)),
-        "csv": lambda filepath: CsvService.read(target=filepath, delimiter=args.csv_sep,
-                                                as_dict=True, skip_header=True, escapechar=args.csv_escape_char),
-        "jsonl": lambda filepath: JsonlService.read_lines(src=filepath)
+        "csv": lambda filepath: CsvService.read(src=filepath, as_dict=True, skip_header=True,
+                                                delimiter=custom_args_dict.get("delimiter", ","),
+                                                escapechar=custom_args_dict.get("escapechar", None)),
+        "tsv": lambda filepath: CsvService.read(src=filepath, as_dict=True, skip_header=True,
+                                                delimiter=custom_args_dict.get("delimiter", "\t"),
+                                                escapechar=custom_args_dict.get("escapechar", None)),
+        "jsonl": lambda filepath: JsonlService.read(src=filepath)
     }
 
     output_formatters = {
-        "jsonl": lambda dicts_it: JsonlService.write(output=args.output, lines_it=dicts_it)
+        "jsonl": lambda dicts_it: JsonlService.write(target=args.output, data_it=dicts_it)
    }
 
     # Initialize NER model
     models_preset = {
         "dynamic": lambda: dynamic_init(src_dir=CWD, class_filepath=ner_model_name, class_name=ner_model_params)(
             # The rest of parameters could be provided from cmd.
-            **CmdArgsService.args_to_dict(model_args))
+            **custom_args_dict)
     }
 
     # Parse the model name.
@@ -109,3 +118,5 @@ def iter_annotated_data(texts_it, batch_size):
     prompts_it = DataService.iter_prompt(data_dict_it=texts_it, prompt=args.prompt, parse_fields_func=iter_params)
     ctxs_it = iter_annotated_data(texts_it=prompts_it, batch_size=args.batch_size)
     output_formatters["jsonl"](dicts_it=tqdm(ctxs_it, desc=f"Processing `{args.src}`"))
+
+    logger.info(f"Saved: {args.output}")
diff --git a/bulk_ner/src/service.py b/bulk_ner/src/service.py
deleted file mode 100644
index b8826b8..0000000
--- a/bulk_ner/src/service.py
+++ /dev/null
@@ -1,71 +0,0 @@
-import csv
-import json
-
-
-class JsonlService:
-
-    @staticmethod
-    def write(output, lines_it):
-        with open(output, "w", encoding='utf8') as f:
-            for line in lines_it:
-                json.dump(line, fp=f, ensure_ascii=False)
-                f.write("\n")
-
-    @staticmethod
-    def read_lines(src, row_id_key=None):
-        assert (isinstance(src, str))
-        with open(src, "r") as f:
-            for line_ind, line in enumerate(f.readlines()):
-                content = json.loads(line)
-                if row_id_key is not None:
-                    content[row_id_key] = line_ind
-                print(content)
-                yield content
-
-
-class CsvService:
-
-    @staticmethod
-    def read(target, skip_header=False, cols=None, as_dict=False, row_id_key=None, **csv_kwargs):
-        assert (isinstance(row_id_key, str) or row_id_key is None)
-        assert (isinstance(cols, list) or cols is None)
-
-        header = None
-        with open(target, newline='\n') as f:
-            for row_id, row in enumerate(csv.reader(f, **csv_kwargs)):
-                if skip_header and row_id == 0:
-                    header = ([row_id_key] if row_id_key is not None else []) + row
-                    continue
-
-                # Determine the content we wish to return.
-                if cols is None:
-                    content = row
-                else:
-                    row_d = {header[col_ind]: value for col_ind, value in enumerate(row)}
-                    content = [row_d[col_name] for col_name in cols]
-
-                content = ([row_id-1] if row_id_key is not None else []) + content
-
-                # Optionally attach row_id to the content.
-                if as_dict:
-                    assert (header is not None)
-                    assert (len(content) == len(header))
-                    yield {k: v for k, v in zip(header, content)}
-                else:
-                    yield content
-
-
-class DataService(object):
-
-    @staticmethod
-    def iter_prompt(data_dict_it, prompt, parse_fields_func):
-        """ This method composes prompt from the multiple fields, mentioned in it.
-            data_it: Iterator
-                iterator of the dict, from which we can collect data.
-        """
-        assert(callable(parse_fields_func))
-        field_names = list(parse_fields_func(prompt))
-        for row_id, data_dict in enumerate(data_dict_it):
-            assert(isinstance(data_dict, dict))
-            fmt_d = {col_name: data_dict[col_name] for col_name in field_names}
-            yield row_id, prompt.format(**fmt_d)
diff --git a/bulk_ner/src/service_prompt.py b/bulk_ner/src/service_prompt.py
new file mode 100644
index 0000000..5fff7c1
--- /dev/null
+++ b/bulk_ner/src/service_prompt.py
@@ -0,0 +1,14 @@
+class DataService(object):
+
+    @staticmethod
+    def iter_prompt(data_dict_it, prompt, parse_fields_func):
+        """ Composes a prompt from the multiple fields mentioned in it.
+            data_dict_it: Iterator
+                iterator of the dicts from which we collect data.
+        """
+        assert(callable(parse_fields_func))
+        field_names = list(parse_fields_func(prompt))
+        for row_id, data_dict in enumerate(data_dict_it):
+            assert(isinstance(data_dict, dict))
+            fmt_d = {col_name: data_dict[col_name] for col_name in field_names}
+            yield row_id, prompt.format(**fmt_d)
diff --git a/bulk_ner/src/utils.py b/bulk_ner/src/utils.py
index 7f53687..a7f9d5d 100644
--- a/bulk_ner/src/utils.py
+++ b/bulk_ner/src/utils.py
@@ -1,3 +1,7 @@
+import logging
+import sys
+
+
 class IdAssigner(object):
 
     def __init__(self):
@@ -48,3 +52,22 @@ def test_ner_demo(iter_answers=None):
         # Finally asking LLM.
         for a in iter_answers(user_input):
             print(a)
+
+
+def setup_custom_logger(name, add_screen_handler=False, filepath=None):
+    formatter = logging.Formatter(fmt='%(asctime)s %(levelname)-8s %(message)s',
+                                  datefmt='%Y-%m-%d %H:%M:%S')
+    logger = logging.getLogger(name)
+    logger.setLevel(logging.INFO)
+
+    if add_screen_handler:
+        screen_handler = logging.StreamHandler(stream=sys.stdout)
+        screen_handler.setFormatter(formatter)
+        logger.addHandler(screen_handler)
+
+    if filepath is not None:
+        handler = logging.FileHandler(filepath, mode='w')
+        handler.setFormatter(formatter)
+        logger.addHandler(handler)
+
+    return logger
\ No newline at end of file
diff --git a/dependencies.txt b/dependencies.txt
index 5bf9122..72c0621 100644
--- a/dependencies.txt
+++ b/dependencies.txt
@@ -1 +1,2 @@
 arekit==0.25.0
+source-iter
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 704a406..f6b5732 100644
--- a/setup.py
+++ b/setup.py
@@ -15,7 +15,7 @@ def get_requirements(filenames):
 
 setup(
     name='bulk_ner',
-    version='0.24.0',
+    version='0.24.1',
     python_requires=">=3.6",
     description='A no-strings inference implementation framework Named Entity Recognition (NER) '
                 'service of wrapped AI models powered by AREkit and the related text-processing pipelines.',
diff --git a/test/data/test.tsv b/test/data/test.tsv
new file mode 100644
index 0000000..93c2961
--- /dev/null
+++ b/test/data/test.tsv
@@ -0,0 +1,7 @@
+text
+It was in July, 1805, and the speaker was the well-known Anna Pávlovna
+Schérer, maid of honor and favorite of the Empress Márya Fëdorovna.
+With these words she greeted Prince Vasíli Kurágin, a man of high
+rank and importance, who was the first to arrive at her reception. Anna
+Pávlovna had had a cough for some days. She was, as she said, suffering
+from la grippe; grippe being then a new word in St. Petersburg, used only by the elite.
\ No newline at end of file