# ---------------------------------------------------------------------------
# discourse-export-script/.gitignore (new file in this patch)
# ---------------------------------------------------------------------------
# __pycache__/
# discourse-export/
# uploaded/

# ---------------------------------------------------------------------------
# discourse-export-script/discourse_download.py (new file in this patch)
#
# Downloads one page of topics from discuss.hail.is, converts the "cooked"
# HTML of every post into markup that renders on GitHub, and writes one JSON
# file per topic into ./discourse-export for github_upload.py to consume.
# ---------------------------------------------------------------------------
from argparse import ArgumentParser
from asyncio import gather, run, sleep
from dataclasses import dataclass
from datetime import datetime
from html import escape
from html.parser import HTMLParser
from json import dump, loads
from os import makedirs
from os.path import join
from re import sub
from typing import Any, Awaitable, Callable, Dict, Iterable, List, Optional, Sequence, Tuple, TypeVar

try:
    from aiohttp import ClientSession
except ModuleNotFoundError:
    # aiohttp is only needed for the network calls made from main(); keeping
    # the import optional lets the pure HTML conversion below be imported (and
    # tested) on its own.  main() fails loudly when it is missing.
    ClientSession = None  # type: ignore[assignment,misc]

strptime = datetime.strptime
strftime = datetime.strftime

# constants
# Placeholder written in front of the slug of every link to another topic;
# github_upload.py replaces "<POST_LINK_ID>/<slug>" with the final GitHub URL.
POST_LINK_ID = "f4706281-cc60-4ff0-a0b6-b803683cc24b"
# Separator written between the posts of one topic; github_upload.py splits on
# it to recover the individual comments.
COMMENT_END_ID = "917f6034-2117-4a8c-bb42-b27fd7fb5e83"
TOPIC_URL_PREFIX = "https://discuss.hail.is/t/"
EXPORT_DIR = "./discourse-export"
SKIPPED_TOPIC_SLUG = "welcome-to-the-hail-community"
CREATED_AT_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ"
POST_NOTE = (
    "> [!NOTE]\n> The following post was exported from discuss.hail.is, a forum for asking questions about Hail "
    "which has since been deprecated.\n\n"
)

# types
CallbackResponse = TypeVar("CallbackResponse")
Callback = Callable[[Dict[str, Any]], CallbackResponse]
Attrs = List[Tuple[str, Optional[str]]]


@dataclass(frozen=True)
class DiscoursePost:
    """One post as returned by ``/posts/<id>.json`` (``html`` is the ``cooked`` field)."""

    id: int
    topic_id: int
    username: str
    created_at: str
    html: str


@dataclass(frozen=True)
class DiscourseTopic:
    """One exported topic; ``html`` is the concatenation of its converted posts."""

    id: int
    slug: str
    title: str
    html: str


# html parser
class DiscourseHTMLParser(HTMLParser):
    """Rewrite Discourse "cooked" post HTML into markup suitable for GitHub.

    Feed the HTML with ``feed()``, call ``close()`` to flush trailing text, then
    read the result from ``output_html``.  Compared to the input:

    * ``<pre><code>`` blocks become fenced ``python`` code blocks;
    * links to other topics get the ``POST_LINK_ID/<slug>`` placeholder href;
    * ``@`` mentions are reduced to the bare user name (no link, no ``@``);
    * anchors with site-relative hrefs are dropped, keeping only their text;
    * link previews (``<aside data-onebox-src=...>``) collapse to their URL;
    * of a quote ``<aside>``, only the header link and the ``<blockquote>`` survive;
    * everything else is passed through.
    """

    def __init__(self) -> None:
        super().__init__()
        self.output_html = ""
        # relative file links starting with /
        self.relative_link = False
        # link previews and quotes
        self.aside = False
        self.aside_src: Optional[str] = None
        self.aside_src_written = False
        self.aside_header = False
        self.aside_header_link = False
        self.aside_header_link_written = False
        # code blocks
        self.code_block_pre = False
        self.code_block_code = False
        # @ mentions
        self.mention = False

    def _decl_handler(self, decl: str) -> None:
        """Pass a ``<!...>`` declaration (e.g. a doctype) through unchanged."""
        self.output_html += f"<!{decl}>"

    def _unknown_decl_handler(self, data: str) -> None:
        """Pass a ``<![...]>`` marked section (e.g. CDATA) through unchanged."""
        self.output_html += f"<![{data}]>"

    def _charref_handler(self, name: str) -> None:
        """Re-emit a numeric character reference such as ``&#62;``."""
        self.output_html += f"&#{name};"

    def _ref_handler(self, name: str) -> None:
        """Re-emit a named entity reference such as ``&gt;``."""
        self.output_html += f"&{name};"

    def _write_starttag(self, attrs: Attrs, tag: str, suffix: str) -> None:
        """Append ``<tag attr="value" ...{suffix}>`` to the output.

        Valueless attributes (``<input disabled>``) are reported by HTMLParser
        with a value of ``None`` and are written back bare.  The parser hands
        attribute values over already unescaped, so they are escaped again here.
        """
        rendered = [tag]
        for key, value in attrs:
            rendered.append(key if value is None else f'{key}="{escape(value, quote=True)}"')
        self.output_html += f"<{' '.join(rendered)}{suffix}>"

    # Not a method: a factory called while the class body executes to build
    # handle_starttag / handle_startendtag, which differ only in the suffix
    # written before the closing ">".
    def _starttag_handler(suffix: str = "") -> Callable[["DiscourseHTMLParser", str, Attrs], None]:
        def inner(self: "DiscourseHTMLParser", tag: str, attrs: Attrs) -> None:
            attr_dict = dict(attrs)
            # ``or ""`` also covers valueless attributes, whose value is None.
            css_class = attr_dict.get("class") or ""
            if ((not self.aside) or self.aside_header) and tag == "a":
                if self.aside_header and not self.aside_header_link_written:
                    self.aside_header_link = True
                link = attr_dict.get("href") or ""
                if "mention" in css_class:
                    self.mention = True
                elif link.startswith("/"):
                    self.relative_link = True
                elif TOPIC_URL_PREFIX in link:
                    # Keep only the slug; the uploader resolves it to a URL.
                    slug = link.partition(TOPIC_URL_PREFIX)[2].split("/")[0]
                    self.output_html += f'<a href="{POST_LINK_ID}/{slug}">'
                else:
                    self._write_starttag(attrs, tag, suffix)
            elif (
                self.aside
                and self.aside_src is None
                and (tag == "header" or (tag == "div" and "title" in css_class))
            ):
                self.aside_header = True
                self.output_html += "\n"
            elif self.aside_header and tag == "blockquote":
                # The quoted text itself is ordinary content again.
                self.aside = False
                self.aside_header = False
                self.aside_header_link_written = False
                self._write_starttag(attrs, tag, suffix)
            elif self.aside_header and tag == "article":
                self.aside_header = False
                self.aside_header_link_written = False
            elif not self.aside:
                if tag == "aside":
                    self.aside = True
                    onebox_src = attr_dict.get("data-onebox-src", None)
                    if onebox_src is not None:
                        # The URL itself is written by handle_data.
                        self.aside_src = onebox_src
                        self.output_html += "\n"
                elif tag == "pre":
                    self.code_block_pre = True
                elif self.code_block_pre:
                    if tag == "code":
                        self.output_html += "\n\n```python\n"
                        self.code_block_code = True
                else:
                    self._write_starttag(attrs, tag, suffix)

        return inner

    handle_charref = _charref_handler
    handle_decl = _decl_handler
    handle_entityref = _ref_handler
    handle_startendtag = _starttag_handler(" /")
    handle_starttag = _starttag_handler()
    unknown_decl = _unknown_decl_handler

    def handle_comment(self, data: str) -> None:
        """Pass an HTML comment through unchanged."""
        self.output_html += f"<!--{data}-->"

    def handle_data(self, data: str) -> None:
        """Write text content, subject to the mention / link preview / quote rules."""
        # Topic URLs that appear as plain text get the same placeholder prefix
        # as topic hrefs.
        data = data.replace(TOPIC_URL_PREFIX, f"{POST_LINK_ID}/")
        if self.mention:
            # Drop the "@" so the export does not ping GitHub users.
            self.output_html += data.partition("@")[2]
        elif self.aside_src is not None and not self.aside_src_written:
            # The first text inside a link preview is replaced by its URL.
            self.output_html += self.aside_src
            self.aside_src_written = True
        elif (not self.aside) or self.aside_header_link:
            self.output_html += data

    def handle_endtag(self, tag: str) -> None:
        """Close whatever the matching start tag opened (which may be nothing)."""
        if ((not self.aside) or self.aside_header) and tag == "a":
            if self.mention:
                self.mention = False
            elif self.relative_link:
                self.relative_link = False
            else:
                if self.aside_header_link:
                    self.aside_header_link = False
                    self.aside_header_link_written = True
                self.output_html += "</a>"
        elif tag == "aside":
            self.aside = False
            if self.aside_src is not None:
                self.output_html += "\n"
                self.aside_src = None
                self.aside_src_written = False
                # NOTE(review): leaving this True suppresses the header link
                # text of the next quote; a reset to False looks intended.
                # Kept as found - confirm before changing.
                self.aside_header_link_written = True
        elif not self.aside:
            if tag == "pre":
                self.code_block_pre = False
            elif self.code_block_pre:
                if tag == "code":
                    self.output_html += "\n```\n\n"
                    self.code_block_code = False
            else:
                self.output_html += f"</{tag}>"

    def handle_pi(self, data: str) -> None:
        """Pass a processing instruction through unchanged."""
        self.output_html += f"<?{data}>"


def render_topic_html(posts: Sequence[DiscoursePost]) -> str:
    """Convert the posts of one topic into the single ``html`` export string.

    Each post is rendered as the deprecation note, a "## (date) user said:"
    heading and the converted post body.  Every post except the last one is
    followed by ``COMMENT_END_ID`` so the uploader can split them apart again.
    """
    chunks = []
    for idx, post in enumerate(posts):
        parser = DiscourseHTMLParser()
        parser.feed(post.html)
        parser.close()  # flush text buffered after the last tag
        created = strptime(post.created_at, CREATED_AT_FORMAT).strftime("%b %d, %Y at %H:%M")
        separator = COMMENT_END_ID if idx < len(posts) - 1 else ""
        chunks.append(f"{POST_NOTE}## ({created}) {post.username} said:\n{parser.output_html} {separator}")
    return "".join(chunks)


def _post_ids(topic: Dict[str, Any]) -> List[int]:
    """Return the ids of all posts of a topic JSON document.

    NOTE(review): assumes ``post_stream.posts`` embeds only the first chunk of
    posts and ``post_stream.stream`` lists every id - confirm against the
    Discourse API docs.  Falls back to ``posts`` when ``stream`` is absent.
    """
    post_stream = topic["post_stream"]
    stream = post_stream.get("stream")
    if stream:
        return list(stream)
    return [post["id"] for post in post_stream["posts"]]


# main script
async def main(discourse_page: int) -> None:
    """Export every topic listed on page ``discourse_page`` of ``/latest.json``.

    Writes ``./discourse-export/<id, zero padded to 4>_<slug>.json`` for each
    topic except the forum's welcome topic.

    Raises:
        RuntimeError: if aiohttp is not installed.
    """
    if ClientSession is None:
        raise RuntimeError("aiohttp is required to download from Discourse")

    async with ClientSession() as session:
        # NOTE: only the requested page is fetched; pass one parse_page() per
        # page in range(discourse_page + 1) to export everything up to it.
        pages = await run_tasks([parse_page(discourse_page, session)])
        topics = await run_tasks(
            [parse_topic(topic["id"], session) for page in pages for topic in page["topic_list"]["topics"]]
        )
        posts = await run_tasks([parse_post(post_id, session) for topic in topics for post_id in _post_ids(topic)])

    topics_by_id = {topic["id"]: topic for topic in topics}
    posts_by_topic: Dict[int, List[DiscoursePost]] = {topic_id: [] for topic_id in topics_by_id}
    for post in posts:
        posts_by_topic[post.topic_id].append(post)

    makedirs(EXPORT_DIR, exist_ok=True)
    for topic_id, topic in topics_by_id.items():
        if topic["slug"] == SKIPPED_TOPIC_SLUG:
            continue
        with open(join(EXPORT_DIR, f'{topic["id"]:04}_{topic["slug"]}.json'), "w") as file:
            dump(
                {
                    "id": topic["id"],
                    "slug": topic["slug"],
                    "title": topic["title"],
                    "html": render_topic_html(posts_by_topic[topic_id]),
                },
                file,
            )


async def run_tasks(tasks: Iterable[Awaitable[Any]], batch_size: int = 4, delay: float = 2) -> List[Any]:
    """Await ``tasks`` in batches and return their results in input order.

    At most ``batch_size`` awaitables run concurrently, and ``delay`` seconds
    pass between two batches, to stay below the forum's rate limit.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    pending = list(tasks)
    results: List[Any] = []
    for start in range(0, len(pending), batch_size):
        if start:
            await sleep(delay)
        results += await gather(*pending[start : start + batch_size])
    return results


async def _fetch_json(session: "ClientSession", url: str) -> Any:
    """GET ``url`` and decode the body as JSON; HTTP error statuses raise."""
    async with session.get(url) as response:
        response.raise_for_status()
        return loads(await response.read())


async def parse_page(discourse_page: int, session: "ClientSession") -> Dict[str, Any]:
    """Fetch one page of the forum's "latest topics" listing."""
    return await _fetch_json(session, f"https://discuss.hail.is/latest.json?page={discourse_page}")


async def parse_topic(topic_id: int, session: "ClientSession") -> Dict[str, Any]:
    """Fetch the JSON document of one topic."""
    return await _fetch_json(session, f"https://discuss.hail.is/t/{topic_id}.json")


async def parse_post(post_id: int, session: "ClientSession") -> DiscoursePost:
    """Fetch one post and keep the fields the export needs."""
    response_json = await _fetch_json(session, f"https://discuss.hail.is/posts/{post_id}.json")
    return DiscoursePost(
        response_json["id"],
        response_json["topic_id"],
        response_json["username"],
        response_json["created_at"],
        response_json["cooked"],
    )


if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument("--page", type=int, required=True, help="page of /latest.json to export")
    args = parser.parse_args()
    run(main(args.page))

# ---------------------------------------------------------------------------
# discourse-export-script/github_upload.py (new file in this patch)
#
# Uploads the JSON files written by discourse_download.py as GitHub
# discussions.  This part holds the module prelude; the functions follow.
# ---------------------------------------------------------------------------
from argparse import ArgumentParser
from asyncio import gather, run, sleep
from datetime import datetime
from json import dumps, load, loads
from os import listdir
from os.path import isfile, join
from re import findall, sub
from shutil import move

try:
    from aiohttp import ClientSession
except ModuleNotFoundError:
    # See the note on the same import above: only main() needs aiohttp.
    ClientSession = None  # type: ignore[assignment,misc]

now = datetime.now
fromtimestamp = datetime.fromtimestamp

# constants
POST_LINK_ID = "f4706281-cc60-4ff0-a0b6-b803683cc24b"
COMMENT_END_ID = "917f6034-2117-4a8c-bb42-b27fd7fb5e83"


# TODO i think the numbering is off?
+async def main(github_issue_number: int, github_token: str) -> None: + links = {} + for idx, filename in enumerate(sorted(listdir("./discourse-export"))): + if isfile(join("./discourse-export", filename)): + id, rest = filename.split("_") + slug, _ = rest.split(".") + with open(f"./discourse-export/{id}_{slug}.json", "r") as file: + links[slug] = { + "id": id, + "idx": idx, + "dests": set([slug for slug in findall(f'{POST_LINK_ID}/([A-Za-z0-9\\-]*?)\\\\"', file.read())]), + } + for slug, data in links.items(): + for dest in data["dests"]: + dest_data = links.get(dest, None) + if dest_data is None: + print( + f"broken link: {slug}->{dest} (https://github.com/hail-is/hail/issues/{github_issue_number + data['idx']})" + ) + else: + print( + f"link: {slug} (https://github.com/hail-is/hail/issues/{github_issue_number + data['idx']}) -> {dest} (https://github.com/hail-is/hail/issues/{github_issue_number + dest_data['idx']})" + ) + with open(f"./discourse-export/{data['id']}_{slug}.json", "r") as file: + json = sub( + f'{POST_LINK_ID}/{dest}"', + f"https://github.com/hail-is/hail/issues/{github_issue_number + dest_data['idx']}\\\"", + file.read(), + ) + with open(f"./discourse-export/{data['id']}_{slug}.json", "w") as file: + file.write(json) + async with ClientSession() as session: + for issue in sorted([{"slug": slug, **data} for slug, data in links.items()], key=lambda x: x["idx"]): + with open(f"./discourse-export/{issue['id']}_{issue['slug']}.json", "r") as file: + topic = load(file) + discussion_id, label_applied, comment_idx = [None, False, 0] + comments = topic["html"].split(COMMENT_END_ID) + discussion_html = comments[0] + rest_comments = comments[1:] + while discussion_id is None: + discussion_id = next( + iter(await gather(create_discussion(discussion_html, topic["title"], session, github_token))) + ) + while not label_applied: + label_applied = next(iter(await gather(apply_label(discussion_id, session, github_token)))) + while comment_idx < 
(len(rest_comments)): + comment_idx = next( + iter( + await gather( + add_comment(comment_idx, rest_comments[comment_idx], discussion_id, session, github_token) + ) + ) + ) + move( + f"./discourse-export/{issue['id']}_{issue['slug']}.json", + f"./uploaded/{issue['id']}_{issue['slug']}.json", + ) + + +async def add_comment(comment_idx, comment_html, discussion_id, session, github_token): + comment_query = f""" + mutation {{ + addDiscussionComment ( + input: {{ + discussionId: "{discussion_id}" + body: {dumps(comment_html)} + }} + ) {{ + comment {{ + id + }} + }} + }} + """ + async with session.post( + "https://api.github.com/graphql", + json={"query": comment_query}, + headers={ + "Accept": "application/vnd.github+json", + "Authorization": f"Bearer {github_token}", + "Content-Type": "application/json; charset=utf-8", + "X-GitHub-Api-Version": "2022-11-28", + }, + ) as comment_response: + comment_response_json = loads(await comment_response.read()) + if comment_response_json.get("errors", None) is not None: + print(comment_response_json) + await handle_error(comment_response.headers) + return comment_idx + return comment_idx + 1 + + +async def apply_label(discussion_id, session, github_token): + label_query = f""" + mutation {{ + addLabelsToLabelable ( + input: {{ + labelableId: "{discussion_id}" + labelIds: ["LA_kwDOKFqpFc8AAAABajc5aQ"] + }} + ) {{ + labelable {{ + labels {{ + totalCount + }} + }} + }} + }} + """ + async with session.post( + "https://api.github.com/graphql", + json={"query": label_query}, + headers={ + "Accept": "application/vnd.github+json", + "Authorization": f"Bearer {github_token}", + "Content-Type": "application/json; charset=utf-8", + "X-GitHub-Api-Version": "2022-11-28", + }, + ) as label_response: + label_response_json = loads(await label_response.read()) + if label_response_json.get("errors", None) is not None: + print(label_response_json) + await handle_error(label_response.headers) + return False + return True + + +async def 
create_discussion(discussion_html, discussion_title, session: ClientSession, github_token: str) -> bool: + discussion_query = f""" + mutation {{ + createDiscussion( + input: {{ + repositoryId: "R_kgDOKFqpFQ", + categoryId: "DIC_kwDOKFqpFc4CYhFv", + body: {dumps(discussion_html)}, + title: "{discussion_title}" + }} + ) {{ + discussion {{ + id + }} + }} + }} + """ + async with session.post( + "https://api.github.com/graphql", + json={"query": discussion_query}, + headers={ + "Accept": "application/vnd.github+json", + "Authorization": f"Bearer {github_token}", + "Content-Type": "application/json; charset=utf-8", + "X-GitHub-Api-Version": "2022-11-28", + }, + ) as discussion_response: + discussion_response_json = loads(await discussion_response.read()) + if discussion_response_json.get("errors", None) is not None: + print(discussion_response_json) + await handle_error(discussion_response.headers) + return None + return discussion_response_json["data"]["createDiscussion"]["discussion"]["id"] + + +async def handle_error(headers): + retry_time = fromtimestamp(int(headers.get("X-RateLimit-Reset"))) + if retry_time > now(): + print(f"Retry time is {retry_time - now()}; waiting for 1 minute...") + await sleep(60) + + +if __name__ == "__main__": + parser = ArgumentParser() + parser.add_argument("--github_issue_number") + parser.add_argument("--github_token") + args = parser.parse_args() + run(main(int(args.github_issue_number), args.github_token)) + + +# TODO change ids to match hail repo using following queries + +# query { +# repository (name: "test-process", owner: "iris-garden") { +# id +# name +# } +# } + +# query { +# repository (name: "test-process", owner: "iris-garden") { +# discussionCategories (first: 100) { +# edges { +# node { +# name +# id +# } +# } +# } +# } +# } + +# query { +# repository (name: "test-process", owner: "iris-garden") { +# labels (first: 100) { +# edges { +# node { +# id +# name +# } +# } +# } +# } +# } diff --git 
a/hail/src/test/resources/mockito-extensions/org.mockito.plugins.MockMaker b/hail/src/test/resources/mockito-extensions/org.mockito.plugins.MockMaker index ca6ee9cea8e..1f0955d450f 100644 --- a/hail/src/test/resources/mockito-extensions/org.mockito.plugins.MockMaker +++ b/hail/src/test/resources/mockito-extensions/org.mockito.plugins.MockMaker @@ -1 +1 @@ -mock-maker-inline \ No newline at end of file +mock-maker-inline