From a660adb245c3d6247ba2dc8d3ed1aaf5ad0e5e98 Mon Sep 17 00:00:00 2001 From: Leo Park Date: Wed, 27 Nov 2024 16:18:41 +0000 Subject: [PATCH 1/5] Refactor and Update - Modularize and polish existing code - Implement Faker library with seed configuration - Add query testing functionality - Add Dockerfile - Include requirements.txt for dependency management --- es_test_data.py | 333 ----------------------------------- modules/__init__.py | 0 modules/config.py | 144 +++++++++++++++ modules/faker.py | 33 ++++ modules/generate_data.py | 368 +++++++++++++++++++++++++++++++++++++++ modules/requests.py | 263 ++++++++++++++++++++++++++++ modules/shared.py | 11 ++ modules/upload_data.py | 128 ++++++++++++++ requirements.txt | 3 +- search_test.py | 66 +++++++ 10 files changed, 1015 insertions(+), 334 deletions(-) delete mode 100755 es_test_data.py create mode 100644 modules/__init__.py create mode 100644 modules/config.py create mode 100644 modules/faker.py create mode 100644 modules/generate_data.py create mode 100644 modules/requests.py create mode 100644 modules/shared.py create mode 100644 modules/upload_data.py create mode 100755 search_test.py diff --git a/es_test_data.py b/es_test_data.py deleted file mode 100755 index 78ba095..0000000 --- a/es_test_data.py +++ /dev/null @@ -1,333 +0,0 @@ -#!/usr/bin/python - -import nest_asyncio -nest_asyncio.apply() - -import json -import csv -import time -import logging -import random -import string -import uuid -import datetime - -import tornado.gen -import tornado.httpclient -import tornado.ioloop -import tornado.options - -try: - xrange - range = xrange -except NameError: - pass - -async_http_client = tornado.httpclient.AsyncHTTPClient() -headers = tornado.httputil.HTTPHeaders({"content-type": "application/json"}) -id_counter = 0 -upload_data_count = 0 -_dict_data = None - - - -def delete_index(idx_name): - try: - url = "%s/%s?refresh=true" % (tornado.options.options.es_url, idx_name) - request = tornado.httpclient.HTTPRequest(url, headers=headers, method="DELETE", request_timeout=240, auth_username=tornado.options.options.username, auth_password=tornado.options.options.password, validate_cert=tornado.options.options.validate_cert) - response = tornado.httpclient.HTTPClient().fetch(request) - logging.info('Deleting index "%s" done %s' % (idx_name, response.body)) - except tornado.httpclient.HTTPError: - pass - - -def create_index(idx_name): - schema = { - "settings": { - "index": { - "number_of_shards": tornado.options.options.num_of_shards, - "number_of_replicas": tornado.options.options.num_of_replicas - } - } - } - - - body = json.dumps(schema) - url = "%s/%s" % (tornado.options.options.es_url, idx_name) - try: - logging.info('Trying to create index %s' % (url)) - request = tornado.httpclient.HTTPRequest(url, headers=headers, method="PUT", body=body, request_timeout=240, auth_username=tornado.options.options.username, auth_password=tornado.options.options.password, validate_cert=tornado.options.options.validate_cert) - response = tornado.httpclient.HTTPClient().fetch(request) - logging.info('Creating index "%s" done %s' % (idx_name, response.body)) - except tornado.httpclient.HTTPError: - logging.info('Looks like the index exists already') - pass - - -@tornado.gen.coroutine -def upload_batch(upload_data_txt): - try: - request = tornado.httpclient.HTTPRequest(tornado.options.options.es_url + "/_bulk", - method="POST", - body=upload_data_txt, - headers=headers, - request_timeout=tornado.options.options.http_upload_timeout, - 
auth_username=tornado.options.options.username, auth_password=tornado.options.options.password, validate_cert=tornado.options.options.validate_cert) - response = yield async_http_client.fetch(request) - except Exception as ex: - logging.error("upload failed, error: %s" % ex) - return - - result = json.loads(response.body.decode('utf-8')) - res_txt = "OK" if not result['errors'] else "FAILED" - took = int(result['took']) - logging.info("Upload: %s - upload took: %5dms, total docs uploaded: %7d" % (res_txt, took, upload_data_count)) - - -def get_data_for_format(format): - split_f = format.split(":") - if not split_f: - return None, None - - field_name = split_f[0] - field_type = split_f[1] - - return_val = '' - - if field_type == 'arr': - return_val = [] - array_len_expr = split_f[2] - if '-' in array_len_expr: - (min,max) = array_len_expr.split('-') - array_len = generate_count(int(min), int(max)) - else: - array_len = int(array_len_expr) - - single_elem_format = field_name + ':' + format[len(field_name) + len(field_type) + len(array_len_expr) + 3 : ] - for i in range(array_len): - x = get_data_for_format(single_elem_format) - return_val.append(x[1]) - - elif field_type == "bool": - return_val = random.choice([True, False]) - - elif field_type == "str": - min = 3 if len(split_f) < 3 else int(split_f[2]) - max = min + 7 if len(split_f) < 4 else int(split_f[3]) - length = generate_count(min, max) - return_val = "".join([random.choice(string.ascii_letters + string.digits) for x in range(length)]) - - elif field_type == "int": - min = 0 if len(split_f) < 3 else int(split_f[2]) - max = min + 100000 if len(split_f) < 4 else int(split_f[3]) - return_val = generate_count(min, max) - - elif field_type == "ipv4": - return_val = "{0}.{1}.{2}.{3}".format(generate_count(0, 245),generate_count(0, 245),generate_count(0, 245),generate_count(0, 245)) - - elif field_type in ["ts", "tstxt"]: - now = int(time.time()) - per_day = 24 * 60 * 60 - min = now - 30 * per_day if len(split_f) < 3 else int(split_f[2]) - max = now + 30 * per_day if len(split_f) < 4 else int(split_f[3]) - ts = generate_count(min, max) - return_val = int(ts * 1000) if field_type == "ts" else datetime.datetime.fromtimestamp(ts).strftime("%Y-%m-%dT%H:%M:%S.000-0000") - - elif field_type == "words": - min = 2 if len(split_f) < 3 else int(split_f[2]) - max = min + 8 if len(split_f) < 4 else int(split_f[3]) - count = generate_count(min, max) - words = [] - for _ in range(count): - word_len = random.randrange(3, 10) - words.append("".join([random.choice(string.ascii_letters + string.digits) for x in range(word_len)])) - return_val = " ".join(words) - - elif field_type == "dict": - global _dict_data - min = 2 if len(split_f) < 3 else int(split_f[2]) - max = min + 8 if len(split_f) < 4 else int(split_f[3]) - count = generate_count(min, max) - return_val = " ".join([random.choice(_dict_data).strip() for _ in range(count)]) - - elif field_type == "text": - text = ["text1", "text2", "text3"] if len(split_f) < 3 else split_f[2].split("-") - min = 1 if len(split_f) < 4 else int(split_f[3]) - max = min + 1 if len(split_f) < 5 else int(split_f[4]) - count = generate_count(min, max) - words = [] - for _ in range(count): - words.append(""+random.choice(text)) - return_val = " ".join(words) - - return field_name, return_val - - -def generate_count(min, max): - if min == max: - return max - elif min > max: - return random.randrange(max, min); - else: - return random.randrange(min, max); - - -def generate_random_doc(format): - global id_counter - - res = {} - 
- for f in format: - f_key, f_val = get_data_for_format(f) - if f_key: - res[f_key] = f_val - - if not tornado.options.options.id_type: - return res - - if tornado.options.options.id_type == 'int': - res['_id'] = id_counter - id_counter += 1 - elif tornado.options.options.id_type == 'uuid4': - res['_id'] = str(uuid.uuid4()) - - return res - - -def set_index_refresh(val): - - params = {"index": {"refresh_interval": val}} - body = json.dumps(params) - url = "%s/%s/_settings" % (tornado.options.options.es_url, tornado.options.options.index_name) - try: - request = tornado.httpclient.HTTPRequest(url, headers=headers, method="PUT", body=body, request_timeout=240, auth_username=tornado.options.options.username, auth_password=tornado.options.options.password, validate_cert=tornado.options.options.validate_cert) - http_client = tornado.httpclient.HTTPClient() - http_client.fetch(request) - logging.info('Set index refresh to %s' % val) - except Exception as ex: - logging.exception(ex) - - -def csv_file_to_json(csvFilePath): - data = [] - - # Open a csv reader called DictReader - with open(csvFilePath, encoding='utf-8') as csvf: - csvReader = csv.DictReader(csvf) - for rows in csvReader: - data.append(rows) - - return json.dumps(data) - - -@tornado.gen.coroutine -def generate_test_data(): - - global upload_data_count - - if tornado.options.options.force_init_index: - delete_index(tornado.options.options.index_name) - - create_index(tornado.options.options.index_name) - - # todo: query what refresh is set to, then restore later - if tornado.options.options.set_refresh: - set_index_refresh("-1") - - if tornado.options.options.out_file: - out_file = open(tornado.options.options.out_file, "w") - else: - out_file = None - - if tornado.options.options.dict_file: - global _dict_data - with open(tornado.options.options.dict_file, 'r') as f: - _dict_data = f.readlines() - logging.info("Loaded %d words from the %s" % (len(_dict_data), tornado.options.options.dict_file)) - - format = tornado.options.options.format.split(',') - if not format: - logging.error('invalid format') - exit(1) - - ts_start = int(time.time()) - upload_data_txt = "" - - if tornado.options.options.data_file: - json_array = "" - if tornado.options.options.data_file.endswith(".csv"): - json_array = json.loads(csv_file_to_json(tornado.options.options.data_file)) - else: - with open(tornado.options.options.data_file, 'r') as f: - json_array = json.load(f) - logging.info("Loaded documents from the %s", tornado.options.options.data_file) - - for item in json_array: - cmd = {'index': {'_index': tornado.options.options.index_name}} - if '_id' in item: - cmd['index']['_id'] = item['_id'] - - upload_data_txt += json.dumps(cmd) + "\n" - upload_data_txt += json.dumps(item) + "\n" - - if upload_data_txt: - yield upload_batch(upload_data_txt) - else: - logging.info("Generating %d docs, upload batch size is %d" % (tornado.options.options.count, - tornado.options.options.batch_size)) - for num in range(0, tornado.options.options.count): - - item = generate_random_doc(format) - - if out_file: - out_file.write("%s\n" % json.dumps(item)) - - cmd = {'index': {'_index': tornado.options.options.index_name}} - if '_id' in item: - cmd['index']['_id'] = item['_id'] - - upload_data_txt += json.dumps(cmd) + "\n" - upload_data_txt += json.dumps(item) + "\n" - upload_data_count += 1 - - if upload_data_count % tornado.options.options.batch_size == 0: - yield upload_batch(upload_data_txt) - upload_data_txt = "" - - # upload remaining items in `upload_data_txt` - if 
upload_data_txt: - yield upload_batch(upload_data_txt) - - if tornado.options.options.set_refresh: - set_index_refresh("1s") - - if out_file: - out_file.close() - - took_secs = int(time.time() - ts_start) - - logging.info("Done - total docs uploaded: %d, took %d seconds" % (tornado.options.options.count, took_secs)) - - -if __name__ == '__main__': - tornado.options.define("es_url", type=str, default='http://localhost:9200/', help="URL of your Elasticsearch node") - tornado.options.define("index_name", type=str, default='test_data', help="Name of the index to store your messages") - tornado.options.define("index_type", type=str, default='test_type', help="Type") - tornado.options.define("batch_size", type=int, default=1000, help="Elasticsearch bulk index batch size") - tornado.options.define("num_of_shards", type=int, default=2, help="Number of shards for ES index") - tornado.options.define("http_upload_timeout", type=int, default=3, help="Timeout in seconds when uploading data") - tornado.options.define("count", type=int, default=100000, help="Number of docs to generate") - tornado.options.define("format", type=str, default='name:str,age:int,last_updated:ts', help="message format") - tornado.options.define("num_of_replicas", type=int, default=0, help="Number of replicas for ES index") - tornado.options.define("force_init_index", type=bool, default=False, help="Force deleting and re-initializing the Elasticsearch index") - tornado.options.define("set_refresh", type=bool, default=False, help="Set refresh rate to -1 before starting the upload") - tornado.options.define("out_file", type=str, default=False, help="If set, write test data to out_file as well.") - tornado.options.define("id_type", type=str, default=None, help="Type of 'id' to use for the docs, valid settings are int and uuid4, None is default") - tornado.options.define("dict_file", type=str, default=None, help="Name of dictionary file to use") - tornado.options.define("data_file", type=str, default=None, help="Name of the documents file to use") - tornado.options.define("username", type=str, default=None, help="Username for elasticsearch") - tornado.options.define("password", type=str, default=None, help="Password for elasticsearch") - tornado.options.define("validate_cert", type=bool, default=True, help="SSL validate_cert for requests. Use false for self-signed certificates.") - tornado.options.parse_command_line() - - tornado.ioloop.IOLoop.instance().run_sync(generate_test_data) diff --git a/modules/__init__.py b/modules/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/modules/config.py b/modules/config.py new file mode 100644 index 0000000..e0db483 --- /dev/null +++ b/modules/config.py @@ -0,0 +1,144 @@ +""" +This is a module that handles all the configuration used for OpenSearch and Elastic Search testing +""" + +import logging +import os + +from tornado.options import define +from tornado.options import parse_config_file, options + +define( + "action", + type=str, + default="all", + help="Specify the action to be performed. 
The available options are \ +[generate_data, query_all, custom_query, delete_index, all]", +) +define("json_path", type=str, default=None, help="Query JSON file path") +define( + "search_db_url", + type=str, + default="http://localhost:9200", + help="URL of your DB node", +) +define( + "index_name", + type=str, + default="test_data", + help="Name of the index to store your messages", +) +define("index_type", type=str, default="test_type", help="index Type") +define( + "batch_size", + type=int, + default=1000, + help="bulk data batch size", +) +define( + "number_of_shards", + type=int, + default=1, + help="Number of shards for OS index", +) +define( + "http_upload_timeout", + type=int, + default=10, + help="Timeout in seconds when uploading data", +) +define("count", type=int, default=1000, help="Number of docs to generate") +define( + "number_of_replicas", + type=int, + default=1, + help="Number of replicas", +) +define( + "force_init_index", + type=bool, + default=False, + help="Force deleting and re-initializing index", +) +define( + "set_refresh", + type=bool, + default=False, + help="Set refresh rate to -1 before starting the upload", +) +define( + "out_file", + type=bool, + default=False, + help="If set,write test data to out_file as well.", +) +define( + "id_type", + type=str, + default=None, + help="Type of 'id' to use for the docs, int and uuid4", +) +define( + "dict_file", type=str, default=None, help="Name of dictionary file to use" +) +define( + "data_file", + type=str, + default=None, + help="Name of the documents file to use", +) +define("username", type=str, default=None, help="Username for access DB") +define("password", type=str, default=None, help="Password for access DB") +define( + "client_cert", + type=str, + default=None, + help="filepath of CA certificates in PEM format", +) +define( + "client_key", + type=str, + default=None, + help="filepath of for client SSL key", +) +define( + "validate_cert", + type=bool, + default=True, + help="SSL validate_cert for requests. \ + Use false for self-signed certificates.", +) +define( + "random_seed", type=int, default=None, help="Random Seed number for Faker" +) +define( + "start_time", + type=str, + default=None, + help="'%Y-%m-%d %H:%M:%S' Shape Start Time", +) +define( + "finish_time", + type=str, + default=None, + help="'%Y-%m-%d %H:%M:%S' Shape Finish Time)", +) +define( + "format", + type=str, + default="name:str,age:int,@timestamp:tstxt", + help="message format", +) + + +def load_config(config_file_path): + """ + Parses and loads the config file at the given path. + """ + if not os.path.isfile(config_file_path): + logging.info( + "%s does not exist. only parse command line", config_file_path + ) + options.parse_command_line() + else: + parse_config_file(config_file_path) diff --git a/modules/faker.py b/modules/faker.py new file mode 100644 index 0000000..d152a4f --- /dev/null +++ b/modules/faker.py @@ -0,0 +1,33 @@ +""" +This module provides fake data generation functionality using the Faker library +""" + +# pylint: disable=E0401 +from faker import Faker +from faker.providers import BaseProvider +from tornado.options import options + + +class BothifyPatternProvider(BaseProvider): + """ + Custom Faker provider generating class + """ + + def random_bothify_pattern(self, length): + """ + Generate a random pattern of specified length using '?' and '#' + + Args: + length (int): The length of the pattern to be generated. + + Returns: + str: A string pattern containing mix of '?' and '#' characters. 
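+
+        Example (illustrative only, the result is random): a call such as
+        random_bothify_pattern(4) could return a pattern like "?#?#", which
+        can then be passed to fake.bothify() to fill the "?" slots with
+        random letters and the "#" slots with random digits.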
+ """ + return "".join(self.random_element(["?", "#"]) for _ in range(length)) + + +fake = Faker() +fake.add_provider(BothifyPatternProvider) + +if options.random_seed is not None: + Faker.seed(options.random_seed) diff --git a/modules/generate_data.py b/modules/generate_data.py new file mode 100644 index 0000000..2be7288 --- /dev/null +++ b/modules/generate_data.py @@ -0,0 +1,368 @@ +""" +This module handles functions that generate Fake data +""" + +# pylint: disable=E0401 +import csv +from datetime import datetime, timedelta +import json +import random +import string +import uuid + +from tornado.options import options + +from . import shared +from .faker import fake + + +def get_data_for_format(fmt): + """ + split 'format' and gives to proper fuctions + """ + + split_f = fmt.split(":") + if not split_f: + return None, None + + # Variables + return_val = "" + + type_handlers = { + "arr": array_generate, + "bool": bool_generate, + "str": str_generate, + "int": int_generate, + "ipv4": ipv4_generate, + "ts": ts_generate, + "tstxt": ts_generate, + "words": words_generate, + "dict": dict_generate, + "text": text_generate, + "log_version": log_version_generate, + "sha": sha_generate, + "file_name": generate_filename, + "uuid": generate_uuid, + "systemd": generate_systemd, + } + + handler = type_handlers.get(split_f[1]) + + if handler: + return_val = handler(fmt) + + return split_f[0], return_val + + +def set_start_time(): + """ + This fuctions is used to wcreate start time for default + """ + if options.start_time is None: + now = datetime.now() + thirty_days_ago = now - timedelta(days=30) + return thirty_days_ago.strftime("%Y-%m-%dT%H:%M:%S.000Z") + + return change_datetime_to_iso8601_str(options.start_time) + + +def set_finish_time(): + """ + This fuctions is used to create finish time for default + """ + if options.start_time is None: + now = datetime.now() + thirty_days_later = now + timedelta(days=30) + return thirty_days_later.strftime("%Y-%m-%dT%H:%M:%S.000Z") + + return change_datetime_to_iso8601_str(options.finish_time) + + +def convert_string_to_datetime(date_string): + """ + string time to datetime + """ + return datetime.strptime(date_string, "%Y-%m-%dT%H:%M:%S.000Z") + + +def change_datetime_to_iso8601_str(time_str): + """ + Parses the input string into a datetime + object and outputs it in ISO 8601 format + """ + date_time_obj = datetime.strptime(time_str, "%Y-%m-%d %H:%M:%S") + + return date_time_obj.strftime("%Y-%m-%dT%H:%M:%S.000Z") + + +def generate_count(min_value, max_value): + """ + This Function Generate Count using Faker + """ + return fake.random_int(min=min_value, max=max_value) + + +def split_fmt(fmt): + """ + split format argument + """ + split_f = fmt.split(":") + field_name = split_f[0] + field_type = split_f[1] + + return (split_f, field_name, field_type) + + +def array_generate(fmt): + """ + generate random array + """ + split_f, field_name, field_type = split_fmt(fmt) + value = [] + array_len_expr = split_f[2] + if "-" in array_len_expr: + (min_value, max_value) = array_len_expr.split("-") + array_len = generate_count(int(min_value), int(max_value)) + else: + array_len = int(array_len_expr) + + single_elem_format = ( + field_name + + ":" + + fmt[len(field_name) + len(field_type) + len(array_len_expr) + 3:] + ) + for _ in range(array_len): + x = get_data_for_format(single_elem_format) + value.append(x[1]) + return value + + +def bool_generate(_): + """ + generate random boolean + """ + return fake.boolean() + + +def str_generate(fmt): + """ + str:min:max a word, 
made up of min to max random upper/lowercase and digit + """ + split_f, _, _ = split_fmt(fmt) + min_value = 3 if len(split_f) < 3 else int(split_f[2]) + max_value = min_value + 7 if len(split_f) < 4 else int(split_f[3]) + length = generate_count(min_value, max_value) + + return fake.bothify(fake.random_bothify_pattern(length)) + + +def int_generate(fmt): + """ + generate a random integer between min and max (int:min:max) + """ + split_f, _, _ = split_fmt(fmt) + min_value = 0 if len(split_f) < 3 else int(split_f[2]) + max_value = min_value + 100000 if len(split_f) < 4 else int(split_f[3]) + return_val = generate_count(min_value, max_value) + + return return_val + + +def ipv4_generate(_): + """ + generate a fake ip + """ + return fake.ipv4() + + +def ts_generate(fmt): + """ + generate a timestamp (in milliseconds) or + timestamp in the "%Y-%m-%dT%H:%M:%S.000Z" format + """ + + _, _, field_type = split_fmt(fmt) + + min_value = set_start_time() + max_value = set_finish_time() + + random_timestamp = fake.date_time_between_dates( + datetime_start=convert_string_to_datetime(min_value), + datetime_end=convert_string_to_datetime(max_value), + ).timestamp() + + return_val = ( + int(random_timestamp * 1000) + if field_type == "ts" + else datetime.fromtimestamp(random_timestamp).strftime( + "%Y-%m-%dT%H:%M:%S.000Z" + ) + ) + return return_val + + +def words_generate(fmt): + """ + Generate random number of strs, separated by space + """ + split_f, _, _ = split_fmt(fmt) + + min_value = 2 if len(split_f) < 3 else int(split_f[2]) + max_value = min_value + 8 if len(split_f) < 4 else int(split_f[3]) + count = generate_count(min_value, max_value) + + words = [] + for _ in range(count): + word_len = random.randrange(3, 10) + word = "".join( + fake.random_element(elements=string.ascii_letters + string.digits) + for _ in range(word_len) + ) + words.append(word) + + return " ".join(words) + + +def dict_generate(fmt): + """ + Generate a random number of entries from the dictionary file + """ + split_f, _, _ = split_fmt(fmt) + + min_value = 2 if len(split_f) < 3 else int(split_f[2]) + max_value = min_value + 8 if len(split_f) < 4 else int(split_f[3]) + count = generate_count(min_value, max_value) + + return " ".join( + [ + fake.random_element(elements=shared.DICT_DATA).strip() + for _ in range(count) + ] + ) + + +def text_generate(fmt): + """ + generate random number of words seperated + by space from a given list of - seperated words + """ + split_f, _, _ = split_fmt(fmt) + + text = ( + ["text1", "text2", "text3"] + if len(split_f) < 3 + else split_f[2].split("-") + ) + min_value = 1 if len(split_f) < 4 else int(split_f[3]) + max_value = min_value + 1 if len(split_f) < 5 else int(split_f[4]) + count = generate_count(min_value, max_value) + words = [] + for _ in range(count): + words.append("" + random.choice(text)) + return_val = " ".join(words) + + return return_val + + +def log_version_generate(_): + """ + Generate fake version string + """ + major = fake.random_int(min=0, max=9) + minor = fake.random_int(min=0, max=99) + patch = fake.random_int(min=0, max=99) + return f"v{major}.{minor}.{patch}" + + +def sha_generate(_): + """ + Generate fake sha hash string + """ + return fake.sha1() + + +def generate_filename(fmt): + """ + Generate fake file + default extention ".py" + """ + split_f, _, _ = split_fmt(fmt) + + if len(split_f) == 2: + extention = "py" + else: + extention = split_f[2] + + return fake.file_name(extension=f".{extention}") + + +def generate_uuid(_): + """ + Generate fake uuid + """ + return 
fake.uuid4() + + +def generate_random_doc(fmt): + # pylint: disable=global-statement + """ + This Function Generate Random Data + """ + + res = {} + + for f in fmt: + f_key, f_val = get_data_for_format(f) + if f_key: + res[f_key] = f_val + + if not options.id_type: + return res + + if options.id_type == "int": + res["_id"] = shared.ID_COUNTER + + shared.ID_COUNTER += 1 + elif options.id_type == "uuid4": + res["_id"] = str(uuid.uuid4()) + + return res + + +def generate_systemd(_): + """ + Generate a random service name. + """ + + suffixes = [ + "service", + "socket", + "target", + "device", + "mount", + "automount", + "swap", + "timer", + "path", + "slice", + "scope", + ] + + random_words = [fake.word() for _ in range(3)] + suffix = random.choice(suffixes) + + return f"{'-'.join(random_words)}.{suffix}" + + +def csv_file_to_json(csv_file_path): + """ + Change CSV file to JSON file + """ + data = [] + + with open(csv_file_path, encoding="utf-8") as csvf: + csv_reader = csv.DictReader(csvf) + for rows in csv_reader: + data.append(rows) + + return json.dumps(data) diff --git a/modules/requests.py b/modules/requests.py new file mode 100644 index 0000000..c9371e8 --- /dev/null +++ b/modules/requests.py @@ -0,0 +1,263 @@ +""" +This module handles functions related to HTTP requests +""" + +# pylint: disable=E0401 +import os +import json +import logging + +import tornado +from tornado.options import options +from tornado.httpclient import HTTPRequest, HTTPClient, HTTPError +from tornado.httputil import HTTPHeaders + +from .generate_data import ( + set_finish_time, + set_start_time, +) +from . import shared + + +def request_https_parameters( + url, request_method, body=None, request_timeout=240 +): + """ + Generate HTTP Request Parameters + """ + headers = HTTPHeaders({"content-type": "application/json"}) + + request_parameters = { + "url": url, + "headers": headers, + "method": request_method, + "request_timeout": request_timeout, + "auth_username": options.username, + "auth_password": options.password, + "validate_cert": options.validate_cert, + } + + if options.client_key is not None: + request_parameters.update({"client_key": options.client_key}) + + if options.client_cert is not None: + request_parameters.update({"client_cert": options.client_cert}) + + if body is not None: + request_parameters.update({"body": body}) + + return request_parameters + + +def delete_index(idx_name): + """ + This Function Delete Index + """ + try: + url = f"{options.search_db_url}/{idx_name}" + request_parameters = request_https_parameters(url, "DELETE") + request = HTTPRequest(**request_parameters) + response = HTTPClient().fetch(request) + logging.info("Deleting index '%s' done", idx_name) + except HTTPError as e: + logging.info("Error while Deleting index %s", e) + + +def create_index(idx_name): + """ + This Function Create Index + """ + schema = { + "settings": { + "index": { + "number_of_shards": options.number_of_shards, + "number_of_replicas": options.number_of_replicas, + } + } + } + + body = json.dumps(schema) + url = f"{options.search_db_url}/{idx_name}" + request_parameters = request_https_parameters(url, "PUT", body) + try: + logging.info("Trying to create index %s", url) + request = HTTPRequest(**request_parameters) + response = HTTPClient().fetch(request) + result = json.loads(response.body.decode("utf-8")) + logging.info("Creating index %s done %s", idx_name, result) + except HTTPError: + logging.info("Looks like the index exists already") + + +@tornado.gen.coroutine +def 
upload_batch(upload_data_txt): + # pylint: disable=W0718 + """ + This Function Upload Batches + """ + url = f"{options.search_db_url}/_bulk" + request_parameters = request_https_parameters( + url, "POST", upload_data_txt, options.http_upload_timeout + ) + try: + request = HTTPRequest(**request_parameters) + response = HTTPClient().fetch(request) + + result = json.loads(response.body.decode("utf-8")) + res_txt = "OK" if not result["errors"] else "FAILED" + took = int(result["took"]) + logging.info( + "Upload: %s - upload took: %5dms, total docs uploaded: %7d", + res_txt, + took, + shared.UPLOAD_DATA_COUNT, + ) + + except HTTPError as e: + logging.error("upload failed, HTTP error: %s", e) + + except Exception as e: + logging.error("Unexpected Error Occured: %s", e) + + +def set_index_refresh(val): + # pylint: disable=W0718 + """ + Refresh Index of DB + """ + body = json.dumps({"index": {"refresh_interval": val}}) + url = f"{options.search_db_url}/{options.index_name}/_settings" + request_parameters = request_https_parameters(url, "PUT", body) + try: + request = HTTPRequest(**request_parameters) + HTTPClient().fetch(request) + logging.info("Set index refresh to %s", val) + except HTTPError as e: + logging.exception("HTTP Error while index refresh %s", e) + + +def initial_search(): + """ + Performs the initial search and returns the scroll ID and total hits + """ + url = f"{options.search_db_url}/{options.index_name}\ +/_search?scroll=1m&request_cache=false" + + start_time = set_start_time() + finish_time = set_finish_time() + + query = { + "size": 10000, + "query": { + "range": { + "@timestamp": { + "gte": start_time, + "lte": finish_time, + } + } + }, + "track_total_hits": True, + } + + body = json.dumps(query) + request_parameters = request_https_parameters(url, "POST", body) + + try: + request = HTTPRequest(**request_parameters) + response = HTTPClient().fetch(request) + result = json.loads(response.body.decode("utf-8")) + total_hits = int(result["hits"]["total"]["value"]) + scroll_id = result["_scroll_id"] + took = result["took"] + + except HTTPError as e: + logging.error("Initial search failed, HTTP error: %s", e) + + return scroll_id, total_hits, took + + +def scroll_search(scroll_id): + """ + Performs a scroll search using the provided scroll ID + """ + url = f"{options.search_db_url}/_search/scroll" + + body = json.dumps({"scroll": "1m", "scroll_id": scroll_id}) + request_parameters = request_https_parameters(url, "POST", body) + + try: + request = HTTPRequest(**request_parameters) + response = HTTPClient().fetch(request) + result = json.loads(response.body.decode("utf-8")) + scroll_id = result["_scroll_id"] + took = result["took"] + + except HTTPError as e: + logging.error("Scroll search failed, HTTP error: %s", e.code) + + return scroll_id, took + + +def clear_scroll(scroll_id): + """ + Clears the scroll context + """ + url = f"{options.search_db_url}/_search/scroll/\ +{scroll_id}" + + request_parameters = request_https_parameters(url, "DELETE") + request = HTTPRequest(**request_parameters) + + try: + HTTPClient().fetch(request) + logging.info("Scroll context cleared successfully") + except HTTPError as e: + logging.error("Failed to clear scroll context, HTTP error: %s", e.code) + + +def query_all(): + """ + Queries the all data within the specified + time range using scroll API + """ + scroll_id, total_hits, total_time = initial_search() + + total_pages = (total_hits + 9999) // 10000 + logging.info("Total hits: %d, Total pages: %d", total_hits, total_pages) + + for page_number in 
range(total_pages):
+        scroll_id, scroll_time = scroll_search(scroll_id)
+        total_time += scroll_time
+        logging.info("Retrieved page %d of %d", page_number + 1, total_pages)
+
+    clear_scroll(scroll_id)
+
+    logging.info("Total Querying time taken: %.2fms", total_time)
+
+
+def custom_query_search(json_path):
+    """
+    Custom query with json file
+    """
+    url = f"{options.search_db_url}/{options.index_name}\
+/_search?request_cache=false"
+
+    if not os.path.isfile(json_path):
+        logging.error("Cannot find JSON file")
+        return None
+
+    with open(json_path, "r", encoding="utf-8") as file:
+        json_data = json.load(file)
+
+    body = json.dumps(json_data)
+    request_parameters = request_https_parameters(url, "POST", body)
+
+    try:
+        request = HTTPRequest(**request_parameters)
+        response = HTTPClient().fetch(request)
+        result = json.loads(response.body.decode("utf-8"))
+        took = result["took"]
+        logging.info("Total Querying time taken: %sms", took)
+
+    except HTTPError as e:
+        logging.error("Custom query search failed, HTTP error: %s", e)
diff --git a/modules/shared.py b/modules/shared.py
new file mode 100644
index 0000000..bc8f75d
--- /dev/null
+++ b/modules/shared.py
@@ -0,0 +1,11 @@
+"""
+This module handles all global values
+"""
+
+# pylint: disable=E0401
+from tornado.httpclient import AsyncHTTPClient
+
+UPLOAD_DATA_COUNT = 0
+DICT_DATA = None
+ID_COUNTER = 0
+async_http_client = AsyncHTTPClient()
diff --git a/modules/upload_data.py b/modules/upload_data.py
new file mode 100644
index 0000000..2a9f9e3
--- /dev/null
+++ b/modules/upload_data.py
@@ -0,0 +1,128 @@
+"""
+This module includes functions used to upload
+generated fake data to the database.
+"""
+
+# pylint: disable=E0401
+import json
+import logging
+import sys
+import time
+
+import tornado
+from tornado.options import options
+
+from .generate_data import (
+    csv_file_to_json,
+    generate_random_doc,
+)
+from .requests import (
+    delete_index,
+    create_index,
+    set_index_refresh,
+    upload_batch,
+)
+from .
import shared + + +@tornado.gen.coroutine +def generate_test_data(): + # pylint: disable=R0912,R0915, R1732 + """ + Upload Test Data + """ + + if options.force_init_index: + delete_index(options.index_name) + + create_index(options.index_name) + + if options.set_refresh: + set_index_refresh("-1") + + if options.out_file: + out_file = open(options.out_file, "w", encoding="utf-8") + else: + out_file = None + + if options.dict_file: + with open(options.dict_file, "r", encoding="utf-8") as f: + shared.DICT_DATA = f.readlines() + logging.info( + "Loaded %d words from the %s", + len(shared.DICT_DATA), + options.dict_file, + ) + + fmt = options.format.split(",") + if not fmt: + logging.error("invalid format") + sys.exit(1) + + ts_start = int(time.time()) + upload_data_txt = "" + + if options.data_file: + json_array = "" + if options.data_file.endswith(".csv"): + json_array = json.loads(csv_file_to_json(options.data_file)) + else: + with open(options.data_file, "r", encoding="utf-8") as f: + json_array = json.load(f) + logging.info("Loaded documents from the %s", options.data_file) + + for item in json_array: + cmd = {"index": {"_index": options.index_name}} + if "_id" in item: + cmd["index"]["_id"] = item["_id"] + + upload_data_txt += json.dumps(cmd) + "\n" + upload_data_txt += json.dumps(item) + "\n" + + if upload_data_txt: + upload_batch(upload_data_txt) + else: + logging.info( + "Generating %d docs, upload batch size is %d", + options.count, + options.batch_size, + ) + + for _ in range(0, options.count): + item = generate_random_doc(fmt) + + if out_file: + out_file.write(f"{json.dumps(item)}\n") + + cmd = {"index": {"_index": options.index_name}} + if "_id" in item: + cmd["index"]["_id"] = item["_id"] + + upload_data_txt += json.dumps(cmd) + "\n" + try: + upload_data_txt += json.dumps(item) + "\n" + except Exception as e: + raise e + shared.UPLOAD_DATA_COUNT += 1 + + if shared.UPLOAD_DATA_COUNT % options.batch_size == 0: + upload_batch(upload_data_txt) + upload_data_txt = "" + + # upload remaining items in `upload_data_txt` + if upload_data_txt: + upload_batch(upload_data_txt) + + if options.set_refresh: + set_index_refresh("1s") + + if out_file: + out_file.close() + + took_secs = int(time.time() - ts_start) + + logging.info( + "Done - total docs uploaded: %d, took %d seconds", + options.count, + took_secs, + ) diff --git a/requirements.txt b/requirements.txt index fe9b6e3..10ea6e4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ tornado==6.4.2 -nest-asyncio==1.5.1 +nest-asyncio==1.6.0 +Faker==30.8.2 diff --git a/search_test.py b/search_test.py new file mode 100755 index 0000000..ebe3aea --- /dev/null +++ b/search_test.py @@ -0,0 +1,66 @@ +""" +Entry Point +""" + +# pylint: disable=E0401 +import time +import logging +import nest_asyncio +import tornado +from tornado.options import options +from modules.config import load_config +from modules.upload_data import generate_test_data +from modules.requests import ( + custom_query_search, + query_all, + delete_index, +) + + +def check_options(): + """ + Raise Error if there is nothing to do + """ + active_options = [ + "generate_data", + "query_all", + "delete_index", + "custom_query", + "all" + ] + + if options.action not in active_options: + logging.error("Nothing to do, check the options") + + +def main(): + """ + Main fuction for search_test + """ + nest_asyncio.apply() + load_config("server.conf") + check_options() + + if options.action in ("generate_data", "all"): + logging.info("***Start Data Generate Test***") + 
time.sleep(3) + tornado.ioloop.IOLoop.instance().run_sync(generate_test_data) + + if options.action in ("query_all", "all"): + logging.info("***Start Query All Test***") + time.sleep(3) + tornado.ioloop.IOLoop.instance().run_sync(query_all) + + if options.action == "custom_query": + logging.info("***Start Query test by using '%s'***", options.json_path) + time.sleep(3) + custom_query_search(options.json_path) + + if options.action in ("delete_index", "all"): + logging.info("***Start Delete Index***") + time.sleep(3) + delete_index(options.index_name) + + +if __name__ == "__main__": + main() From f64aea749a1cf8b74959ec50bc14434f5dbed7a5 Mon Sep 17 00:00:00 2001 From: Leo Park Date: Wed, 27 Nov 2024 16:19:42 +0000 Subject: [PATCH 2/5] Update License --- LICENSE | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LICENSE b/LICENSE index fcd8057..699fe46 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,7 @@ The MIT License (MIT) Copyright (c) 2015 Oliver +Copyright (c) 2024 Codethink Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -19,4 +20,3 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - From b3da6d0630d83fe771ae3efcb73287e243683080 Mon Sep 17 00:00:00 2001 From: Leo Park Date: Wed, 27 Nov 2024 17:00:58 +0000 Subject: [PATCH 3/5] Docs: Update README.md --- README.md | 166 ++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 116 insertions(+), 50 deletions(-) diff --git a/README.md b/README.md index 773de4f..8e4e55e 100644 --- a/README.md +++ b/README.md @@ -1,62 +1,57 @@ -# Quick fix for opensearch 2.6 -- fix index creation -- remove doc type - # Elasticsearch For Beginners: Generate and Upload Randomized Test Data Because everybody loves test data. ## Ok, so what is this thing doing? -`es_test_data.py` lets you generate and upload randomized test data to -your ES cluster so you can start running queries, see what performance +`search_test.py` lets you generate and upload randomized test data to +your Elasticsearch or Opensearch cluster so you can start running queries, see what performance is like, and verify your cluster is able to handle the load. It allows for easy configuring of what the test documents look like, what kind of data types they include and what the field names are called. -## Cool, how do I use this? +## Cool, how do I use this? ### Run Python script -Let's assume you have an Elasticsearch cluster running. +Let's assume you have an Elasticsearch or Opensearch cluster running. -Python and [Tornado](https://github.com/tornadoweb/tornado/) are used. Run -`pip install tornado` to install Tornado if you don't have it already. +Python, [Tornado](https://github.com/tornadoweb/tornado/) and [Faker](https://github.com/joke2k/faker) are used. Run +`pip install tornado` and `pip install Faker` to install if you don't have them already. 
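+
+Alternatively, all of the pinned dependencies that ship with this repository can be
+installed in one step (a sketch, assuming you run it from the repository root where
+`requirements.txt` lives):
+
+```bash
+# installs tornado, nest-asyncio and Faker as pinned in requirements.txt
+pip install -r requirements.txt
+```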
It's as simple as this: +```text +$ python3 search_test.py + +***Start Data Generate Test*** +Trying to create index http://localhost:9200/test_data +Creating index test_data done {'acknowledged': True, 'shards_acknowledged': True, 'index': 'test_data'} +Generating 2000 docs, upload batch size is 1000 +Upload: OK - upload took: 412ms, total docs uploaded: 1000 +Upload: OK - upload took: 197ms, total docs uploaded: 2000 +Done - total docs uploaded: 2000, took 1 seconds +***Start Query All Test*** +Total hits: 2000, Total pages: 1 +Retrieved page 1 of 1 +Scroll context cleared successfully +Total Querying time taken: 85.00ms +***Start Delete Index*** +Deleting index 'test_data' done ``` -$ python es_test_data.py --es_url=http://localhost:9200 -[I 150604 15:43:19 es_test_data:42] Trying to create index http://localhost:9200/test_data -[I 150604 15:43:19 es_test_data:47] Guess the index exists already -[I 150604 15:43:19 es_test_data:184] Generating 10000 docs, upload batch size is 1000 -[I 150604 15:43:19 es_test_data:62] Upload: OK - upload took: 25ms, total docs uploaded: 1000 -[I 150604 15:43:20 es_test_data:62] Upload: OK - upload took: 25ms, total docs uploaded: 2000 -[I 150604 15:43:20 es_test_data:62] Upload: OK - upload took: 19ms, total docs uploaded: 3000 -[I 150604 15:43:20 es_test_data:62] Upload: OK - upload took: 18ms, total docs uploaded: 4000 -[I 150604 15:43:20 es_test_data:62] Upload: OK - upload took: 27ms, total docs uploaded: 5000 -[I 150604 15:43:20 es_test_data:62] Upload: OK - upload took: 19ms, total docs uploaded: 6000 -[I 150604 15:43:20 es_test_data:62] Upload: OK - upload took: 15ms, total docs uploaded: 7000 -[I 150604 15:43:20 es_test_data:62] Upload: OK - upload took: 24ms, total docs uploaded: 8000 -[I 150604 15:43:20 es_test_data:62] Upload: OK - upload took: 32ms, total docs uploaded: 9000 -[I 150604 15:43:20 es_test_data:62] Upload: OK - upload took: 31ms, total docs uploaded: 10000 -[I 150604 15:43:20 es_test_data:216] Done - total docs uploaded: 10000, took 1 seconds -[I 150604 15:43:20 es_test_data:217] Bulk upload average: 23 ms -[I 150604 15:43:20 es_test_data:218] Bulk upload median: 24 ms -[I 150604 15:43:20 es_test_data:219] Bulk upload 95th percentile: 31 ms -``` - + Without any command line options, it will generate and upload 1000 documents of the format -``` +```json { "name":<>, "age":<>, - "last_updated":<> + "@timestamp":<> } ``` + to an Elasticsearch cluster at `http://localhost:9200` to an index called `test_data`. @@ -70,7 +65,7 @@ Requires [Docker](https://docs.docker.com/get-docker/) for running the app and [ ``` 1. Clone this repository ```bash - $ git clone https://github.com/oliver006/elasticsearch-test-data.git + $ git clone $ cd elasticsearch-test-data ``` 1. Run the ElasticSearch stack @@ -92,38 +87,106 @@ Requires [Docker](https://docs.docker.com/get-docker/) for running the app and [ ## Not bad but what can I configure? 
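+
+As a quick, hypothetical example (assuming no `server.conf` file is present, so the
+flags below are read from the command line, and `my_test_data` is just a placeholder
+index name), a run that only generates 5000 documents might look like this:
+
+```bash
+python3 search_test.py --action=generate_data --search_db_url=http://localhost:9200 --index_name=my_test_data --count=5000
+```
+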
-`python es_test_data.py --help` gives you the full set of command line -ptions, here are the most important ones: - -- `--es_url=http://localhost:9200` the base URL of your ES node, don't - include the index name -- `--username=` the username when basic auth is required -- `--password=` the password when basic auth is required +`python search-test.py --help` also gives you the full set of command line options, +here are more description about the most important ones: + +- `action`: [generate_data, query_all, custom_query, delete_index, all] choose one + - generate_data: upload the data generated through `format` to the OpenSearch database. + - query_all: request all values of the specified index within the range using `start_time` and `finish_time`. For this test, you need to generate time data using `@timestamp` for it to be callable. If you don't want to use that key, use `custom_query`. + - custom_query: You can specify the values for the body used in the request through a JSON file. this option require `json_path`. For more information read the api documentation - [ElasticSearch](https://www.elastic.co/guide/en/elasticsearch/reference/current/search-your-data.html#run-an-es-search) [OpenSearch](https://opensearch.org/docs/latest/api-reference/search/) + - delete_index: All data at the specified index will be deleted. (Please use with caution.) + - all: (default) Conduct whole process test.(generate_data -> query_all -> delete_index) +- For authentication to the server, the following options are available: + - `validate_cert` : SSL validate_cert for requests. Use false for self-signed certificates + - Certificate based auth: + - `client_cert` : filepath of CA certificates in PEM format + - `client_key` : filepath of for client SSL key + - Username based auth: + - `username` : the username when basic auth is required + - `password` : the password when basic auth is required +- `--search_db_url=http://localhost:9200` the base URL of your search DB node, don't include the index name - `--count=###` number of documents to generate and upload - `--index_name=test_data` the name of the index to upload the data to. If it doesn't exist it'll be created with these options - `--num_of_shards=2` the number of shards for the index - `--num_of_replicas=0` the number of replicas for the index -- `--batch_size=###` we use bulk upload to send the docs to ES, this option - controls how many we send at a time +- `--batch_size=###` we use bulk upload to send the docs to DB, this option controls how many we send at a time - `--force_init_index=False` if `True` it will delete and re-create the index - `--dict_file=filename.dic` if provided the `dict` data type will use words from the dictionary file, format is one word per line. The entire file is loaded at start-up so be careful with (very) large files. - `--data_file=filename.json|filename.csv` if provided all data in the filename will be inserted into es. The file content has to be an array of json objects (the documents). If the file ends in `.csv` then the data is automatically converted into json and inserted as documents. -## What about the document format? +### All configuration + +| Setting | Description | Default Value | +| ----------------------- | ---------------------------------------------------------------------- | ----------------------- | +| action | Specify the action to be performed. 
| all | +| json_path | Query JSON file path | None | +| batch_size | bulk index batch size | 1000 | +| client_cert | Filepath of CA certificates in PEM format | None | +| client_key | Filepath of client SSL key | None | +| count | Number of docs to generate | 1000 | +| data_file | Name of the documents file to use | None | +| dict_file | Name of dictionary file to use | None | +| finish_time | Shape Finish Time in '%Y-%m-%d %H:%M:%S' format | None | +| force_init_index | Force deleting and re-initializing the index | False | +| format | Message format | (truncated for brevity) | +| http_upload_timeout | Timeout in seconds when uploading data | 10 | +| id_type | Type of 'id' to use for the docs, int or uuid4 | None | +| index_name | Name of the index to store your messages | test_data | +| index_type | Index type | test_type | +| number_of_replicas | Number of replicas | 1 | +| number_of_shards | Number of shards | 1 | +| search_db_url | URL of your DB | http://localhost:9200 | +| out_file | Write test data to out_file as well | False | +| password | Password for DB | None | +| random_seed | Random seed number for Faker | None | +| set_refresh | Set refresh rate to -1 before starting the upload | False | +| start_time | Shape Start Time in '%Y-%m-%d %H:%M:%S' format | None | +| username | Username for DB | None | +| validate_cert | SSL validate_cert for requests. Use false for self-signed certificates | True | + +### How to setup config file + +Recommended method for Config is create `server.conf` file and input the values needed. + +However, when there are many values to set, it is much more convenient to create and use a `server.conf` file. + +Enter the desired options in the `server.conf` file. + +Example: + +Create the configure file + +```shell +cd ${REPOSITORY}/elasticsearch-test-data +touch server.conf +${EDITOR} server.conf +``` + +Edit configure file + +```conf +# server.conf +action = "all" +opensearch_url = "https://uri.for.search.db:port" +username = TEST_NAME +password = TEST_PASSWORD +``` + +### What about the document format? Glad you're asking, let's get to the doc format. The doc format is configured via `--format=<>` with the default being -`name:str,age:int,last_updated:ts`. +`name:str,age:int,@timestamp:tstxt`. The general syntax looks like this: `<>:<>,<>::<>, ...` -For every document, `es_test_data.py` will generate random values for each of +For every document, `search_test.py` will generate random values for each of the fields configured. Currently supported field types are: @@ -146,16 +209,19 @@ Currently supported field types are: given list of `-` seperated words, the words are optional defaulting to `text1` `text2` and `text3`, min and max are optional, defaulting to `1` and `1` -- `arr:[array_length_expression]:[single_element_format]` an array of entries - with format specified by `single_element_format`. `array_length_expression` - can be either a single number, or pair of numbers separated by `-` (i.e. 3-7), +- `arr:[array_length_expression]:[single_element_format]` an array of entries + with format specified by `single_element_format`. `array_length_expression` + can be either a single number, or pair of numbers separated by `-` (i.e. 
3-7), defining range of lengths from with random length will be picked for each array (Example `int_array:arr:1-5:int:1:250`) - +- `log_version` a random version `str` looks like v1.1.1 +- `sha` generate random sha(len 40) +- `file_name` Generate fake file `file_name:.py` +- `uuid` Generate fake uuid +- `systemd` Generate fake systemd name ## Todo -- document the remaining cmd line options - more different format types - ... From e330e2c70d3627aa4329ee0714bf185992d86521 Mon Sep 17 00:00:00 2001 From: Leo Park Date: Fri, 29 Nov 2024 10:47:25 +0000 Subject: [PATCH 4/5] Update Dockerfile --- .dockerignore | 2 +- Dockerfile | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.dockerignore b/.dockerignore index 848b968..5d5535c 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,3 +1,3 @@ ** !requirements.txt -!es_test_data.py +!search_test.py diff --git a/Dockerfile b/Dockerfile index 6319e3d..59a22c1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -39,7 +39,8 @@ RUN pip install --ignore-installed --no-warn-script-location --prefix="/dist" -r WORKDIR /dist/ -COPY es_test_data.py . +COPY modules ./modules/ +COPY search_test.py . # For debugging the Build Stage CMD ["bash"] @@ -79,4 +80,4 @@ USER "$APP_USER_NAME" COPY --from=build --chown="$APP_USER_NAME":"$APP_GROUP_ID" /dist/ "$PYTHONUSERBASE"/ # Use ENTRYPOINT instead CMD to force the container to start the application -ENTRYPOINT ["python", "es_test_data.py"] \ No newline at end of file +ENTRYPOINT ["python", "search_test.py"] From f9bc375531669f1312f903d07b4e897b23efcd4f Mon Sep 17 00:00:00 2001 From: Leo Park Date: Fri, 29 Nov 2024 11:37:11 +0000 Subject: [PATCH 5/5] CI: Update Run tests --- .drone.yml | 2 +- .github/workflows/tests.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.drone.yml b/.drone.yml index 3f279c0..3025207 100644 --- a/.drone.yml +++ b/.drone.yml @@ -26,7 +26,7 @@ steps: - pip3 install -r requirements.txt - sleep 30 - curl -s http://es:9200 - - "python3 es_test_data.py -es_url=http://es:9200" + - "python3 search_test.py -es_url=http://es:9200" when: event: - pull_request diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 19dafd8..38604a0 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -38,5 +38,5 @@ jobs: curl -s http://localhost:9200 - name: Run tests - run: python3 es_test_data.py --count=1000 --es_url=http://localhost:9200 + run: python3 search_test.py --count=2000 --search_db_url=http://localhost:9200