Skip to content

Commit

Permalink
Refactoring #10, #11, #12, and everything related to #13
Browse files Browse the repository at this point in the history
  • Loading branch information
nicolay-r committed Nov 4, 2024
1 parent 8c02d3b commit d8c7f2d
Show file tree
Hide file tree
Showing 8 changed files with 70 additions and 84 deletions.
7 changes: 4 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# bulk-ner
# bulk-ner 0.24.1
![](https://img.shields.io/badge/Python-3.9-brightgreen.svg)
![](https://img.shields.io/badge/AREkit-0.25.0-orange.svg)
[![](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nicolay-r/ner-service/blob/main/NER_annotation_service.ipynb)
Expand All @@ -18,7 +18,7 @@ The key benefits of this tiny framework are as follows:
# Installation

```bash
pip install bulk-ner==0.24.0
pip install bulk-ner==0.24.1
```

# Usage
Expand All @@ -27,10 +27,11 @@ This is an example for using `DeepPavlov==1.3.0` as an adapter for NER models pa

```bash
python -m bulk_ner.annotate \
--src "test/data/test.csv" \
--src "test/data/test.tsv" \
--prompt "{text}" \
--batch-size 10 \
--adapter "dynamic:models/dp_130.py:DeepPavlovNER" \
--output "test-annotated.jsonl" \
%% \
--model "ner_ontonotes_bert_mult"
```
Expand Down
29 changes: 20 additions & 9 deletions bulk_ner/annotate.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,23 @@
import argparse
import os
import sys

from tqdm import tqdm

from source_iter.service_csv import CsvService
from source_iter.service_jsonl import JsonlService

from arekit.common.pipeline.batching import BatchingPipelineLauncher
from arekit.common.pipeline.context import PipelineContext
from arekit.common.pipeline.utils import BatchIterator

from bulk_ner.src.entity import IndexedEntity
from bulk_ner.src.pipeline.entity_list import HandleListPipelineItem
from bulk_ner.src.pipeline.ner import NERPipelineItem
from bulk_ner.src.service import JsonlService, DataService, CsvService
from bulk_ner.src.service_args import CmdArgsService
from bulk_ner.src.service_dynamic import dynamic_init
from bulk_ner.src.utils import IdAssigner, iter_params, parse_filepath, test_ner_demo
from bulk_ner.src.service_prompt import DataService
from bulk_ner.src.utils import IdAssigner, iter_params, parse_filepath, test_ner_demo, setup_custom_logger


def iter_annotated_data(texts_it, batch_size):
Expand All @@ -38,20 +42,21 @@ def iter_annotated_data(texts_it, batch_size):


if __name__ == '__main__':

logger = setup_custom_logger("bulk-ner")

parser = argparse.ArgumentParser(description="Apply NER annotation")

parser.add_argument('--adapter', dest='adapter', type=str, default=None)
parser.add_argument('--del-meta', dest="del_meta", type=list, default=["parent_ctx"])
parser.add_argument('--csv-sep', dest='csv_sep', type=str, default='\t')
parser.add_argument('--csv-escape-char', dest='csv_escape_char', type=str, default=None)
parser.add_argument('--prompt', dest='prompt', type=str, default="{text}")
parser.add_argument('--src', dest='src', type=str, default=None)
parser.add_argument('--output', dest='output', type=str, default=None)
parser.add_argument('--batch-size', dest='batch_size', type=int, default=5)
parser.add_argument('--chunk-limit', dest='chunk_limit', type=int, default=128)

native_args, model_args = CmdArgsService.partition_list(lst=sys.argv, sep="%%")
custom_args_dict = CmdArgsService.args_to_dict(model_args)

args = parser.parse_args(args=native_args[1:])

Expand All @@ -62,20 +67,24 @@ def iter_annotated_data(texts_it, batch_size):
input_formatters = {
None: lambda _: test_ner_demo(
iter_answers=lambda example: iter_annotated_data(texts_it=iter([(0, example)]), batch_size=1)),
"csv": lambda filepath: CsvService.read(target=filepath, delimiter=args.csv_sep,
as_dict=True, skip_header=True, escapechar=args.csv_escape_char),
"jsonl": lambda filepath: JsonlService.read_lines(src=filepath)
"csv": lambda filepath: CsvService.read(src=filepath, as_dict=True, skip_header=True,
delimiter=custom_args_dict.get("delimiter", ","),
escapechar=custom_args_dict.get("escapechar", None)),
"tsv": lambda filepath: CsvService.read(src=filepath, as_dict=True, skip_header=True,
delimiter=custom_args_dict.get("delimiter", "\t"),
escapechar=custom_args_dict.get("escapechar", None)),
"jsonl": lambda filepath: JsonlService.read(src=filepath)
}

output_formatters = {
"jsonl": lambda dicts_it: JsonlService.write(output=args.output, lines_it=dicts_it)
"jsonl": lambda dicts_it: JsonlService.write(target=args.output, data_it=dicts_it)
}

# Initialize NER model
models_preset = {
"dynamic": lambda: dynamic_init(src_dir=CWD, class_filepath=ner_model_name, class_name=ner_model_params)(
# The rest of parameters could be provided from cmd.
**CmdArgsService.args_to_dict(model_args))
**custom_args_dict)
}

# Parse the model name.
Expand Down Expand Up @@ -109,3 +118,5 @@ def iter_annotated_data(texts_it, batch_size):
prompts_it = DataService.iter_prompt(data_dict_it=texts_it, prompt=args.prompt, parse_fields_func=iter_params)
ctxs_it = iter_annotated_data(texts_it=prompts_it, batch_size=args.batch_size)
output_formatters["jsonl"](dicts_it=tqdm(ctxs_it, desc=f"Processing `{args.src}`"))

logger.info(f"Saved: {args.output}")
71 changes: 0 additions & 71 deletions bulk_ner/src/service.py

This file was deleted.

14 changes: 14 additions & 0 deletions bulk_ner/src/service_prompt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
class DataService(object):

    @staticmethod
    def iter_prompt(data_dict_it, prompt, parse_fields_func):
        """ Compose a prompt per row by substituting dict fields into a template.

        data_dict_it: Iterator
            iterator over dicts from which the field values are collected.
        prompt: str
            template string with `{field}` placeholders.
        parse_fields_func: callable
            given the template, returns the placeholder field names it mentions.

        Yields (row_id, formatted_prompt) pairs, with row_id counted from 0.
        """
        assert(callable(parse_fields_func))
        required_fields = list(parse_fields_func(prompt))
        row_id = 0
        for data_dict in data_dict_it:
            assert(isinstance(data_dict, dict))
            values = dict((name, data_dict[name]) for name in required_fields)
            yield row_id, prompt.format(**values)
            row_id += 1
23 changes: 23 additions & 0 deletions bulk_ner/src/utils.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
import logging
import sys


class IdAssigner(object):

def __init__(self):
Expand Down Expand Up @@ -48,3 +52,22 @@ def test_ner_demo(iter_answers=None):
# Finally asking LLM.
for a in iter_answers(user_input):
print(a)


def setup_custom_logger(name, add_screen_handler=False, filepath=None):
    """ Create (or fetch) a named logger at INFO level with a timestamped format.

    name: str
        logger name passed to `logging.getLogger`.
    add_screen_handler: bool
        when True, attach a StreamHandler writing to stdout.
    filepath: str or None
        when given, also attach a FileHandler that truncates the file (mode='w').

    NOTE(review): `logging.getLogger(name)` returns the same logger on every
    call, so invoking this twice with the same name appends duplicate handlers —
    presumably it is intended to be called once per logger name; confirm.
    """
    fmt = logging.Formatter(fmt='%(asctime)s %(levelname)-8s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')

    custom_logger = logging.getLogger(name)
    custom_logger.setLevel(logging.INFO)

    # Collect the requested handlers, keeping screen-before-file order.
    handlers = []
    if add_screen_handler:
        handlers.append(logging.StreamHandler(stream=sys.stdout))
    if filepath is not None:
        handlers.append(logging.FileHandler(filepath, mode='w'))

    for h in handlers:
        h.setFormatter(fmt)
        custom_logger.addHandler(h)

    return custom_logger
1 change: 1 addition & 0 deletions dependencies.txt
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
arekit==0.25.0
source-iter
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def get_requirements(filenames):

setup(
name='bulk_ner',
version='0.24.0',
version='0.24.1',
python_requires=">=3.6",
description='A no-strings inference implementation framework Named Entity Recognition (NER) '
'service of wrapped AI models powered by AREkit and the related text-processing pipelines.',
Expand Down
7 changes: 7 additions & 0 deletions test/data/test.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
text
It was in July, 1805, and the speaker was the well-known Anna Pávlovna
Schérer, maid of honor and favorite of the Empress Márya Fëdorovna.
With these words she greeted Prince Vasíli Kurágin, a man of high
rank and importance, who was the first to arrive at her reception. Anna
Pávlovna had had a cough for some days. She was, as she said, suffering
from la grippe; grippe being then a new word in St. Petersburg, used only by the elite.

0 comments on commit d8c7f2d

Please sign in to comment.