From 71e33a3abe4cf18e6a49b8c7d212958b346ba395 Mon Sep 17 00:00:00 2001
From: iris <84595986+iris-garden@users.noreply.github.com>
Date: Tue, 20 Aug 2024 13:33:09 -0400
Subject: [PATCH] wip
---
discourse-export-script/.gitignore | 3 +
discourse-export-script/discourse_download.py | 251 ++++++++++++++++++
discourse-export-script/github_upload.py | 231 ++++++++++++++++
.../org.mockito.plugins.MockMaker | 2 +-
4 files changed, 486 insertions(+), 1 deletion(-)
create mode 100644 discourse-export-script/.gitignore
create mode 100644 discourse-export-script/discourse_download.py
create mode 100644 discourse-export-script/github_upload.py
diff --git a/discourse-export-script/.gitignore b/discourse-export-script/.gitignore
new file mode 100644
index 00000000000..6b7d7596f4a
--- /dev/null
+++ b/discourse-export-script/.gitignore
@@ -0,0 +1,3 @@
+__pycache__/
+discourse-export/
+uploaded/
diff --git a/discourse-export-script/discourse_download.py b/discourse-export-script/discourse_download.py
new file mode 100644
index 00000000000..567eec6c5f8
--- /dev/null
+++ b/discourse-export-script/discourse_download.py
@@ -0,0 +1,251 @@
+from argparse import ArgumentParser
+from asyncio import gather, run, sleep
+from dataclasses import dataclass
+from datetime import datetime
+from html.parser import HTMLParser
+from json import dump, loads
+from re import sub
+from typing import Any, Callable, Dict, List, Tuple, TypeVar
+
+from aiohttp import ClientSession
+
+strptime = datetime.strptime  # alias; main() uses it to parse each post's created_at
+strftime = datetime.strftime  # alias; not referenced elsewhere in the visible script
+
+# constants
+POST_LINK_ID = "f4706281-cc60-4ff0-a0b6-b803683cc24b"  # placeholder written in place of the discuss.hail.is topic URL prefix; github_upload.py searches for it
+COMMENT_END_ID = "917f6034-2117-4a8c-bb42-b27fd7fb5e83"  # separator main() writes between posts of a topic; github_upload.py splits on it
+
+# types
+CallbackResponse = TypeVar("CallbackResponse")
+Callback = Callable[[Dict[str, Any]], CallbackResponse]  # not referenced elsewhere in the visible script
+
+
+@dataclass(frozen=True)
+class DiscoursePost:  # one forum post; parse_post() builds it from the post's JSON
+ id: int  # "id" field of the post JSON
+ topic_id: int  # "topic_id" field; main() groups posts by it
+ username: str  # "username" field; main() prints it in the post heading
+ created_at: str  # "created_at" field; main() parses it with '%Y-%m-%dT%H:%M:%S.%fZ'
+ html: str  # "cooked" field; main() feeds it to DiscourseHTMLParser
+
+
+@dataclass(frozen=True)
+class DiscourseTopic:  # NOTE(review): never instantiated in the visible script; main() writes a plain dict with these same four keys
+ id: int
+ slug: str
+ title: str
+ html: str
+
+
+# html parser
+class DiscourseHTMLParser(HTMLParser):  # rewrites the HTML it is fed into self.output_html, tracking context in boolean flags
+ def __init__(self: "DiscourseHTMLParser") -> None:
+ super().__init__()  # NOTE(review): HTMLParser defaults to convert_charrefs=True, so references may reach handle_data already decoded instead of _ref_handler; confirm
+ self.output_html = ""  # rewritten markup accumulated by the handlers below
+ # relative file links starting with /
+ self.relative_link = False  # set on an anchor whose href starts with "/", cleared on its end tag
+ # link previews and quotes
+ self.aside = False  # set on an aside start tag; cleared on its end tag or on a blockquote inside the aside header
+ self.aside_src = None  # data-onebox-src attribute of the current aside, when it has one
+ self.aside_src_written = False  # whether handle_data has already emitted aside_src
+ self.aside_header = False  # set on the aside's header tag or "title" div; cleared on blockquote or article
+ self.aside_header_link = False  # set while inside an anchor of the aside header that should be emitted
+ self.aside_header_link_written = False  # set once that header anchor has been closed
+ # code blocks
+ self.code_block_pre = False  # set between the pre start and end tags
+ self.code_block_code = False  # set between the code start and end tags inside a pre
+ # @ mentions
+ self.mention = False  # set while inside an anchor whose class contains "mention"
+
+ def _decl_handler(self: "DiscourseHTMLParser", decl: str) -> None:  # bound below as handle_decl and unknown_decl
+ self.output_html += f""  # NOTE(review): empty f-string, so nothing is appended and decl is unused; confirm
+
+ def _ref_handler(self: "DiscourseHTMLParser", name: str) -> None:  # bound below as handle_charref and handle_entityref
+ self.output_html += f"&{name};"  # NOTE(review): as handle_charref this presumably drops the "#" of a numeric reference; verify
+
+ def _write_starttag(self: "DiscourseHTMLParser", attrs: List[Tuple[str, str]], tag: str, suffix: str) -> None:  # re-serialises a start tag with its attributes
+ attr_str_prefix = " " if len(attrs) > 0 else ""  # single space between tag name and attributes, only when there are any
+ attr_str = " ".join([f'{key}="{value}"' for key, value in attrs])  # NOTE(review): a value-less attribute presumably arrives as None and would be written as key="None"; verify
+ self.output_html += f"<{tag}{attr_str_prefix}{attr_str}{suffix}>"
+
+ def _starttag_handler(suffix: str = "") -> None:  # factory called in the class body below, not a method; NOTE(review): it returns inner, so "-> None" looks inaccurate
+ def inner(self: "DiscourseHTMLParser", tag: str, attrs: List[Tuple[str, str]]) -> None:
+ attr_dict = dict(attrs)
+ if ((not self.aside) or self.aside_header) and tag == "a":  # anchors outside an aside, or inside an aside header
+ if self.aside_header and not self.aside_header_link_written:
+ self.aside_header_link = True
+ link = attr_dict.get("href", "")
+ if "mention" in attr_dict.get("class", ""):  # mention link: no tag is written here; handle_data emits the text after "@"
+ self.mention = True
+ elif link.startswith("/"):  # site-relative link: the opening tag is not written
+ self.relative_link = True
+ elif "https://discuss.hail.is/t/" in link:  # link to another forum topic
+ slug = link.removeprefix("https://discuss.hail.is/t/").split("/")[0]
+ self.output_html += f''  # NOTE(review): empty f-string; slug is computed but not used on this line; confirm
+ else:
+ self._write_starttag(attrs, tag, suffix)
+ elif (
+ self.aside
+ and self.aside_src is None
+ and (tag == "header" or (tag == "div" and "title" in attr_dict.get("class", "")))
+ ):  # header tag or "title" div inside an aside that has no onebox source
+ self.aside_header = True
+ self.output_html += "\n"
+ elif self.aside_header and tag == "blockquote":  # a blockquote ends both header mode and aside mode, and is written out
+ self.aside = False
+ self.aside_header = False
+ self.aside_header_link_written = False
+ self._write_starttag(attrs, tag, suffix)
+ elif self.aside_header and tag == "article":  # an article ends header mode without being written
+ self.aside_header = False
+ self.aside_header_link_written = False
+ elif not self.aside:
+ if tag == "aside":
+ self.aside = True
+ onebox_src = attr_dict.get("data-onebox-src", None)
+ if onebox_src is not None:
+ self.aside_src = onebox_src  # handle_data later emits this value once
+ self.output_html += f'\n'
+ elif tag == "pre":  # remember the pre; the tag itself is not written
+ self.code_block_pre = True
+ elif self.code_block_pre:
+ if tag == "code":  # code inside pre opens a fenced block labelled python
+ self.output_html += "\n\n```python\n"
+ self.code_block_code = True
+ else:
+ self._write_starttag(attrs, tag, suffix)
+
+ return inner
+
+ handle_charref = _ref_handler
+ handle_decl = _decl_handler
+ handle_entityref = _ref_handler
+ handle_startendtag = _starttag_handler(" /")  # self-closing tags are re-serialised with " /" before ">"
+ handle_starttag = _starttag_handler()
+ unknown_decl = _decl_handler
+
+ def handle_comment(self: "DiscourseHTMLParser", data: str) -> None:
+ self.output_html += f""  # NOTE(review): empty f-string, so comments are dropped and data is unused; confirm
+
+ def handle_data(self: "DiscourseHTMLParser", data: str) -> None:
+ if "https://discuss.hail.is/t/" in data:
+ data = sub('https://discuss.hail.is/t/([A-Za-z0-9\\-]*?)', f'{POST_LINK_ID}/\\1', data)  # the lazy group ends the pattern, so it matches "" and only the URL prefix is replaced
+ if self.mention:
+ self.output_html += f'{data.partition("@")[2]}'  # text after the first "@"
+ elif self.aside_src is not None and not self.aside_src_written:  # first text inside a onebox aside is replaced by its source, once
+ self.output_html += self.aside_src
+ self.aside_src_written = True
+ elif (not self.aside) or self.aside_header_link:  # other text inside an aside is dropped unless it is in the header link
+ self.output_html += data
+
+ def handle_endtag(self: "DiscourseHTMLParser", tag: str) -> None:
+ if ((not self.aside) or self.aside_header) and tag == "a":  # mirrors the anchor condition used for start tags
+ if self.mention:
+ self.mention = False
+ elif self.relative_link:
+ self.relative_link = False
+ else:
+ if self.aside_header_link:
+ self.aside_header_link = False
+ self.aside_header_link_written = True
+ self.output_html += ""  # NOTE(review): appends an empty string, a no-op; confirm
+ elif tag == "aside":
+ self.aside = False
+ if self.aside_src is not None:
+ self.output_html += "\n"
+ self.aside_src = None
+ self.aside_src_written = False
+ self.aside_header_link_written = True  # NOTE(review): set to True here although __init__ starts it as False; confirm
+ elif not self.aside:
+ if tag == "pre":
+ self.code_block_pre = False
+ elif self.code_block_pre:
+ if tag == "code":  # closes the fence opened by the matching code start tag
+ self.output_html += "\n```\n\n"
+ self.code_block_code = False
+ else:
+ self.output_html += f"{tag}>"  # NOTE(review): emits the tag name and ">" with no leading "</"; confirm
+
+ def handle_pi(self: "DiscourseHTMLParser", data: str) -> None:
+ self.output_html += f"{data}>"  # NOTE(review): emits data and ">" with no leading "<?"; confirm
+
+
+# main script
+async def main(discourse_page: int) -> None:
+    """Export one page of discuss.hail.is topics to ./discourse-export/<id>_<slug>.json.
+
+    Fetches page ``discourse_page`` of latest.json, then every listed topic and every post in each topic's
+    post_stream, converts each post with DiscourseHTMLParser and writes one JSON file per topic (keys: id,
+    slug, title, html). Posts of one topic are joined into ``html`` and separated by COMMENT_END_ID.
+    """
+    from os import makedirs  # local import keeps this fix self-contained
+
+    async with ClientSession() as session:
+        pages = await run_tasks([parse_page(discourse_page, session)])
+        topics = await run_tasks([
+            parse_topic(topic["id"], session) for page in pages for topic in page["topic_list"]["topics"]
+        ])
+        posts = await run_tasks([
+            parse_post(post["id"], session) for topic in topics for post in topic["post_stream"]["posts"]
+        ])
+
+    # group posts under their topic id, in the order they were requested
+    topic_acc = {topic["id"]: {"fields": topic, "posts": []} for topic in topics}
+    for post in posts:
+        topic_acc[post.topic_id]["posts"].append(post)
+    makedirs("./discourse-export", exist_ok=True)  # open(..., 'w') below fails if the directory is missing
+    for topic in topic_acc.values():
+        if topic["fields"]["slug"] == "welcome-to-the-hail-community":
+            continue  # this slug is excluded from the export
+        topic_html = ""
+        for idx, post in enumerate(topic["posts"]):
+            parser = DiscourseHTMLParser()
+            parser.feed(post.html)
+            parser.close()  # flush any text HTMLParser is still buffering
+            topic_html += f"> [!NOTE]\n> The following post was exported from discuss.hail.is, a forum for asking questions about Hail which has since been deprecated.\n\n## ({strptime(post.created_at, '%Y-%m-%dT%H:%M:%S.%fZ').strftime('%b %d, %Y at %H:%M')}) {post.username} said:\n{parser.output_html} {COMMENT_END_ID if idx < (len(topic['posts']) - 1) else ''}"
+        fields = topic["fields"]
+        with open(f'./discourse-export/{fields["id"]:04}_{fields["slug"]}.json', 'w') as file:
+            dump({"id": fields["id"], "slug": fields["slug"], "title": fields["title"], "html": topic_html}, file)
+
+
+async def run_tasks(tasks):
+    """Await the given awaitables in order, 4 at a time with a 2 second pause, and return all results."""
+    collected, remaining = [], tasks
+    while remaining:
+        if len(remaining) <= 5:  # five or fewer left: run them together and finish
+            collected.extend(await gather(*remaining))
+            break
+        head, remaining = remaining[:4], remaining[4:]
+        collected.extend(await gather(*head))
+        await sleep(2)  # pause before starting the next batch
+    return collected
+
+
+async def parse_page(discourse_page: int, session: ClientSession) -> Dict[str, Any]:  # fetch latest.json for one page and return the decoded JSON
+    async with session.get(f"https://discuss.hail.is/latest.json?page={discourse_page}") as response:
+        return await response.json()
+
+
+async def parse_topic(topic_id: int, session: ClientSession) -> Dict[str, Any]:  # fetch /t/<topic_id>.json and return the decoded JSON
+    async with session.get(f"https://discuss.hail.is/t/{topic_id}.json") as response:
+        return loads(await response.read())
+
+
+async def parse_post(post_id: int, session: ClientSession) -> DiscoursePost:  # fetch /posts/<post_id>.json and keep the fields the export uses
+    async with session.get(f"https://discuss.hail.is/posts/{post_id}.json") as response:
+        response_json = loads(await response.read())
+        return DiscoursePost(
+            id=response_json["id"],
+            topic_id=response_json["topic_id"],
+            username=response_json["username"],
+            created_at=response_json["created_at"],
+            html=response_json["cooked"],  # the "cooked" field is what this script treats as the post's HTML
+        )
+
+
+if __name__ == "__main__":
+    parser = ArgumentParser()
+    parser.add_argument("--page", type=int, required=True)  # omitting --page used to crash in int(None)
+    args = parser.parse_args()
+    run(main(args.page))
diff --git a/discourse-export-script/github_upload.py b/discourse-export-script/github_upload.py
new file mode 100644
index 00000000000..b11660fa338
--- /dev/null
+++ b/discourse-export-script/github_upload.py
@@ -0,0 +1,231 @@
+from argparse import ArgumentParser
+from asyncio import gather, run, sleep
+from datetime import datetime
+from json import dumps, load, loads
+from os import listdir
+from os.path import isfile, join
+from re import findall, sub
+from shutil import move
+
+from aiohttp import ClientSession
+
+now = datetime.now  # alias used by handle_error()
+fromtimestamp = datetime.fromtimestamp  # alias used by handle_error()
+
+# constants
+POST_LINK_ID = "f4706281-cc60-4ff0-a0b6-b803683cc24b"  # same value as in discourse_download.py; main() searches the exported files for it
+COMMENT_END_ID = "917f6034-2117-4a8c-bb42-b27fd7fb5e83"  # same value as in discourse_download.py; main() splits a topic's html on it
+
+
+# TODO i think the numbering is off?
+async def main(github_issue_number: int, github_token: str) -> None:
+    """Upload every exported topic in ./discourse-export as a GitHub discussion.
+
+    Topic files are processed in sorted filename order; the Nth file (0-based) is assumed to receive number
+    ``github_issue_number + N``, and placeholder links (POST_LINK_ID/<slug>) are rewritten in the files to
+    https://github.com/hail-is/hail/issues/<number> before upload. Uploaded files are moved to ./uploaded.
+    """
+    from os import makedirs  # local import keeps this fix self-contained
+
+    export_dir = "./discourse-export"
+    issue_url = "https://github.com/hail-is/hail/issues"
+    # index regular files only, so other directory entries no longer leave gaps in the numbering
+    filenames = [name for name in sorted(listdir(export_dir)) if isfile(join(export_dir, name))]
+    links = {}
+    for idx, filename in enumerate(filenames):
+        topic_id, rest = filename.split("_")
+        slug, _ = rest.split(".")
+        with open(f"{export_dir}/{topic_id}_{slug}.json", "r") as file:
+            dests = set(findall(f'{POST_LINK_ID}/([A-Za-z0-9\\-]*?)\\\\"', file.read()))
+        links[slug] = {"id": topic_id, "idx": idx, "dests": dests}
+
+    for slug, data in links.items():
+        path = f"{export_dir}/{data['id']}_{slug}.json"
+        src_url = f"{issue_url}/{github_issue_number + data['idx']}"
+        for dest in data["dests"]:
+            dest_data = links.get(dest)
+            if dest_data is None:
+                print(f"broken link: {slug}->{dest} ({src_url})")
+                continue
+            dest_url = f"{issue_url}/{github_issue_number + dest_data['idx']}"
+            print(f"link: {slug} ({src_url}) -> {dest} ({dest_url})")
+            with open(path, "r") as file:
+                text = file.read()
+            # the findall above only collects links followed by \" (a JSON-escaped quote); the previous regex
+            # here left out that backslash, so it could not match those same occurrences
+            text = text.replace(f'{POST_LINK_ID}/{dest}\\"', f'{dest_url}\\"')
+            with open(path, "w") as file:
+                file.write(text)
+    makedirs("./uploaded", exist_ok=True)  # move() below needs the destination directory to exist
+    async with ClientSession() as session:
+        for slug, data in sorted(links.items(), key=lambda item: item[1]["idx"]):
+            path = f"{export_dir}/{data['id']}_{slug}.json"
+            with open(path, "r") as file:
+                topic = load(file)
+            first_post, *rest_comments = topic["html"].split(COMMENT_END_ID)
+            # each helper reports failure through its return value, so repeat the call until it succeeds
+            discussion_id = None
+            while discussion_id is None:
+                discussion_id = await create_discussion(first_post, topic["title"], session, github_token)
+            while not await apply_label(discussion_id, session, github_token):
+                pass
+            comment_idx = 0
+            while comment_idx < len(rest_comments):
+                comment_idx = await add_comment(
+                    comment_idx, rest_comments[comment_idx], discussion_id, session, github_token
+                )
+            move(path, f"./uploaded/{data['id']}_{slug}.json")
+
+
+async def add_comment(comment_idx, comment_html, discussion_id, session, github_token):
+    """Post one comment to a discussion; return comment_idx + 1 on success, comment_idx unchanged on error."""
+    mutation = f"""
+ mutation {{
+ addDiscussionComment (
+ input: {{
+ discussionId: "{discussion_id}"
+ body: {dumps(comment_html)}
+ }}
+ ) {{
+ comment {{
+ id
+ }}
+ }}
+ }}
+ """
+    request_headers = {
+        "Accept": "application/vnd.github+json",
+        "Authorization": f"Bearer {github_token}",
+        "Content-Type": "application/json; charset=utf-8",
+        "X-GitHub-Api-Version": "2022-11-28",
+    }
+    async with session.post(
+        "https://api.github.com/graphql", json={"query": mutation}, headers=request_headers
+    ) as reply:
+        payload = loads(await reply.read())
+        if payload.get("errors") is None:
+            return comment_idx + 1
+        print(payload)  # show the API's error payload before backing off
+        await handle_error(reply.headers)
+        return comment_idx
+
+
+async def apply_label(discussion_id, session, github_token):
+    """Add the hard-coded label to a discussion; return True on success, False if the API reported errors."""
+    mutation = f"""
+ mutation {{
+ addLabelsToLabelable (
+ input: {{
+ labelableId: "{discussion_id}"
+ labelIds: ["LA_kwDOKFqpFc8AAAABajc5aQ"]
+ }}
+ ) {{
+ labelable {{
+ labels {{
+ totalCount
+ }}
+ }}
+ }}
+ }}
+ """
+    request_headers = {
+        "Accept": "application/vnd.github+json",
+        "Authorization": f"Bearer {github_token}",
+        "Content-Type": "application/json; charset=utf-8",
+        "X-GitHub-Api-Version": "2022-11-28",
+    }
+    async with session.post(
+        "https://api.github.com/graphql", json={"query": mutation}, headers=request_headers
+    ) as reply:
+        payload = loads(await reply.read())
+        if payload.get("errors") is None:
+            return True
+        print(payload)  # show the API's error payload before backing off
+        await handle_error(reply.headers)
+        return False
+
+
+async def create_discussion(discussion_html, discussion_title, session: ClientSession, github_token: str) -> "str | None":
+    """Create a discussion from the first post; return its node id, or None if the API reported errors."""
+    discussion_query = f"""
+ mutation {{
+ createDiscussion(
+ input: {{
+ repositoryId: "R_kgDOKFqpFQ",
+ categoryId: "DIC_kwDOKFqpFc4CYhFv",
+ body: {dumps(discussion_html)},
+ title: {dumps(discussion_title)}
+ }}
+ ) {{
+ discussion {{
+ id
+ }}
+ }}
+ }}
+ """
+    request_headers = {
+        "Accept": "application/vnd.github+json",
+        "Authorization": f"Bearer {github_token}",
+        "Content-Type": "application/json; charset=utf-8",
+        "X-GitHub-Api-Version": "2022-11-28",
+    }
+    async with session.post(
+        "https://api.github.com/graphql", json={"query": discussion_query}, headers=request_headers
+    ) as discussion_response:
+        discussion_response_json = loads(await discussion_response.read())
+        if discussion_response_json.get("errors") is not None:
+            print(discussion_response_json)  # show the API's error payload before backing off
+            await handle_error(discussion_response.headers)
+            return None
+        return discussion_response_json["data"]["createDiscussion"]["discussion"]["id"]
+
+
+async def handle_error(headers):  # after a failed API call: wait one minute if the rate limit has not reset yet
+    reset_at = headers.get("X-RateLimit-Reset")  # read as epoch seconds; int(None) used to raise when the header was absent
+    if reset_at is not None and (retry_time := fromtimestamp(int(reset_at))) > now():
+        print(f"Retry time is {retry_time - now()}; waiting for 1 minute...")
+        await sleep(60)
+
+
+if __name__ == "__main__":
+    parser = ArgumentParser()
+    parser.add_argument("--github_issue_number", type=int, required=True)  # base number: main() links topic N as this number + N
+    parser.add_argument("--github_token", required=True)  # sent as the Bearer token on every API request
+    args = parser.parse_args()
+    run(main(args.github_issue_number, args.github_token))
+
+
+# TODO: change the hard-coded repository, category and label ids above to match the hail repo, using the following queries
+
+# query {
+# repository (name: "test-process", owner: "iris-garden") {
+# id
+# name
+# }
+# }
+
+# query {
+# repository (name: "test-process", owner: "iris-garden") {
+# discussionCategories (first: 100) {
+# edges {
+# node {
+# name
+# id
+# }
+# }
+# }
+# }
+# }
+
+# query {
+# repository (name: "test-process", owner: "iris-garden") {
+# labels (first: 100) {
+# edges {
+# node {
+# id
+# name
+# }
+# }
+# }
+# }
+# }
diff --git a/hail/src/test/resources/mockito-extensions/org.mockito.plugins.MockMaker b/hail/src/test/resources/mockito-extensions/org.mockito.plugins.MockMaker
index ca6ee9cea8e..1f0955d450f 100644
--- a/hail/src/test/resources/mockito-extensions/org.mockito.plugins.MockMaker
+++ b/hail/src/test/resources/mockito-extensions/org.mockito.plugins.MockMaker
@@ -1 +1 @@
-mock-maker-inline
\ No newline at end of file
+mock-maker-inline