From 847fc4c0ae540d49398d190bc5d207c7b0a88769 Mon Sep 17 00:00:00 2001 From: "Ariel N. Lee" <103224818+arielnlee@users.noreply.github.com> Date: Mon, 6 Jan 2025 20:57:45 -0500 Subject: [PATCH 1/2] fix parse_robots_txt --- src/web_analysis/parse_robots.py | 137 +++++++++++++++++++++---------- src/web_analysis/robots_util.py | 8 +- 2 files changed, 96 insertions(+), 49 deletions(-) diff --git a/src/web_analysis/parse_robots.py b/src/web_analysis/parse_robots.py index cc4cf859..990438f0 100644 --- a/src/web_analysis/parse_robots.py +++ b/src/web_analysis/parse_robots.py @@ -1,5 +1,4 @@ import argparse -import io import gzip import json from collections import defaultdict @@ -8,25 +7,46 @@ def parse_robots_txt(robots_txt): """Parses the robots.txt to create a map of agents, sitemaps, and parsing oddities.""" - rules = {"ERRORS": []} + rules = {"ERRORS": [], "Sitemaps": []} + agent_map = {} current_agents = [] - - for line in robots_txt.splitlines(): - line = line.strip() - if line.startswith('User-agent:'): - agent = line.split(':', 1)[1].strip() - if agent not in rules: - rules[agent] = defaultdict(list) - current_agents = [agent] - elif line.startswith(('Allow:', 'Disallow:', 'Crawl-delay:')) and current_agents: + + for raw_line in robots_txt.splitlines(): + line = raw_line.split("#", 1)[0].strip() + if not line: + continue + lower_line = line.lower() + + if lower_line.startswith("user-agent:"): + agent_name_raw = line.split(":", 1)[1].strip() + agent_name = agent_name_raw.lower() + + if agent_name not in agent_map: + agent_map[agent_name] = agent_name_raw + rules[agent_name_raw] = defaultdict(list) + current_agents.append(agent_name) + + elif any( + lower_line.startswith(d + ":") for d in ["allow", "disallow", "crawl-delay"] + ): + if not current_agents: + rules["ERRORS"].append( + f"Directive '{line}' found with no preceding user-agent." + ) + continue + directive_name = lower_line.split(":", 1)[0].strip() + directive_value = line.split(":", 1)[1].strip() + directive_key = directive_name.capitalize() for agent in current_agents: - rules[agent][line.split(":")[0]].append(":".join(line.split(":")[1:]).strip()) - elif line.lower().startswith('sitemap:'): - rules.setdefault('Sitemaps', []).append(line.split(':', 1)[1].strip()) - elif line == "" or line.startswith('#'): - current_agents = [] + original_name = agent_map[agent] + rules[original_name][directive_key].append(directive_value) + + elif lower_line.startswith("sitemap:"): + sitemap_value = line.split(":", 1)[1].strip() + rules["Sitemaps"].append(sitemap_value) else: - rules["ERRORS"].append(f"Unmatched line: {line}") + rules["ERRORS"].append(f"Unmatched line: {raw_line}") + return rules @@ -39,7 +59,11 @@ def parallel_parse_robots(data): url_to_rules = {url: {} for url, txt in data.items() if not txt} # interpret the robots.txt with ThreadPoolExecutor(max_workers=10) as executor: - future_to_url = {executor.submit(parse_robots_txt, txt): url for url, txt in data.items() if txt} + future_to_url = { + executor.submit(parse_robots_txt, txt): url + for url, txt in data.items() + if txt + } for future in as_completed(future_to_url): url = future_to_url[future] try: @@ -54,9 +78,13 @@ def interpret_agent(rules): agent_disallow = [x for x in rules.get("Disallow", []) if "?" not in x] agent_allow = [x for x in rules.get("Allow", []) if "?" 
not in x] - if len(agent_disallow) == 0 or agent_disallow == [""] or (agent_allow == agent_disallow): + if ( + len(agent_disallow) == 0 + or agent_disallow == [""] + or (agent_allow == agent_disallow) + ): disallow_type = "none" - elif any('/' == x.strip() for x in agent_disallow) and len(agent_allow) == 0: + elif any("/" == x.strip() for x in agent_disallow) and len(agent_allow) == 0: disallow_type = "all" else: disallow_type = "some" @@ -65,7 +93,7 @@ def interpret_agent(rules): def interpret_robots(agent_rules, all_agents): - """Given the robots.txt agent rules, and a list of relevant agents + """Given the robots.txt agent rules, and a list of relevant agents (a superset of the robots.txt), determine whether they are: (1) blocked from scraping all parts of the website @@ -80,12 +108,14 @@ def interpret_robots(agent_rules, all_agents): for agent in all_agents: rule = agent_rules.get(agent) - judgement = interpret_agent(rule) if rule is not None else agent_to_judgement["*"] + judgement = ( + interpret_agent(rule) if rule is not None else agent_to_judgement["*"] + ) agent_to_judgement[agent] = judgement return agent_to_judgement - + def aggregate_robots(url_to_rules, all_agents): """Across all robots.txt, determine basic stats: (1) For each agent, how often it is explicitly mentioned @@ -93,7 +123,7 @@ def aggregate_robots(url_to_rules, all_agents): (3) For each agent, how often it is blocked from scraping part of the website (4) For each agent, how often it is NOT blocked from scraping at all """ - robots_stats = defaultdict(lambda: {'counter': 0, 'all': 0, 'some': 0, 'none': 0}) + robots_stats = defaultdict(lambda: {"counter": 0, "all": 0, "some": 0, "none": 0}) url_decisions = {} for url, robots in url_to_rules.items(): @@ -103,11 +133,11 @@ def aggregate_robots(url_to_rules, all_agents): for agent in all_agents + ["*"]: judgement = agent_to_judgement[agent] robots_stats[agent][judgement] += 1 - if agent in robots: # Missing robots.txt means nothing blocked + if agent in robots: # Missing robots.txt means nothing blocked robots_stats[agent]["counter"] += 1 - # See if *All Agents* are blocked on all content, - # or at least some agents can access some or more content, or + # See if *All Agents* are blocked on all content, + # or at least some agents can access some or more content, or # there are no blocks on any agents at all. if all(v == "all" for v in agent_to_judgement.values()): agg_judgement = "all" @@ -118,14 +148,15 @@ def aggregate_robots(url_to_rules, all_agents): robots_stats["*All Agents*"]["counter"] += 1 robots_stats["*All Agents*"][agg_judgement] += 1 url_decisions[url]["*All Agents*"] = agg_judgement - + return robots_stats, url_decisions def read_robots_file(file_path): - with gzip.open(file_path, 'rt') as file: + with gzip.open(file_path, "rt") as file: return json.load(file) + def analyze_robots(data): """Takes in {URL --> robots text} @@ -136,20 +167,34 @@ def analyze_robots(data): `some` is how many times its had some paths blocked `none` is how many times its had none paths blocked url_interpretations: {URL --> {Agent --> }} - Only includes agents seen at this URL. + Only includes agents seen at this URL. Agents not seen at this URL will inherit "*" rules, or `none` at all. 
""" url_to_rules = parallel_parse_robots(data) # Collect all agents - all_agents = list(set([agent for url, rules in url_to_rules.items() - for agent, _ in rules.items() if agent not in ["ERRORS", "Sitemaps", "*"]])) + all_agents = list( + set( + [ + agent + for url, rules in url_to_rules.items() + for agent, _ in rules.items() + if agent not in ["ERRORS", "Sitemaps", "*"] + ] + ) + ) robots_stats, url_decisions = aggregate_robots(url_to_rules, all_agents) # URL --> agents in its robots.txt and their decisions (all/some/none). - url_interpretations = {k: - {agent: v for agent, v in vs.items() if agent not in ["ERRORS", "Sitemaps"] and agent in list(url_to_rules[k]) + ["*All Agents*"]} - for k, vs in url_decisions.items()} + url_interpretations = { + k: { + agent: v + for agent, v in vs.items() + if agent not in ["ERRORS", "Sitemaps"] + and agent in list(url_to_rules[k]) + ["*All Agents*"] + } + for k, vs in url_decisions.items() + } return robots_stats, url_interpretations @@ -157,7 +202,9 @@ def analyze_robots(data): def main(args): data = read_robots_file(args.file_path) print(f"Total URLs: {len(data)}") - print(f"URLs with robots.txt: {sum(1 for robots_txt in data.values() if robots_txt)}") + print( + f"URLs with robots.txt: {sum(1 for robots_txt in data.values() if robots_txt)}" + ) robots_stats, url_interpretations = analyze_robots(data) @@ -168,14 +215,20 @@ def main(args): # import pdb; pdb.set_trace() # print(url_interpretations["http://www.machinenoveltranslation.com/robots.txt"]) - sorted_robots_stats = sorted(robots_stats.items(), key=lambda x: x[1]['counter'] / len(data) if len(data) > 0 else 0, reverse=True) + sorted_robots_stats = sorted( + robots_stats.items(), + key=lambda x: x[1]["counter"] / len(data) if len(data) > 0 else 0, + reverse=True, + ) - print(f"{'Agent':<30} {'Observed': <10} {'Blocked All': >15} {'Blocked Some': >15} {'Blocked None': >15}") + print( + f"{'Agent':<30} {'Observed': <10} {'Blocked All': >15} {'Blocked Some': >15} {'Blocked None': >15}" + ) for i, (agent, stats) in enumerate(sorted_robots_stats): - counter_pct = stats['counter'] / len(data) * 100 if len(data) > 0 else 0 - all_pct = stats['all'] / len(data) * 100 if len(data) > 0 else 0 - some_pct = stats['some'] / len(data) * 100 if len(data) > 0 else 0 - none_pct = stats['none'] / len(data) * 100 if len(data) > 0 else 0 + counter_pct = stats["counter"] / len(data) * 100 if len(data) > 0 else 0 + all_pct = stats["all"] / len(data) * 100 if len(data) > 0 else 0 + some_pct = stats["some"] / len(data) * 100 if len(data) > 0 else 0 + none_pct = stats["none"] / len(data) * 100 if len(data) > 0 else 0 print_outs = [ f"{agent:<20}", f"{stats['counter']:>5} ({counter_pct:5.1f}%) {'':>5} ", diff --git a/src/web_analysis/robots_util.py b/src/web_analysis/robots_util.py index 76b987f0..da6d3199 100644 --- a/src/web_analysis/robots_util.py +++ b/src/web_analysis/robots_util.py @@ -428,9 +428,7 @@ def compute_url_date_agent_status_detailed(data, relevant_agents): status_summary = defaultdict(lambda: defaultdict(lambda: defaultdict(str))) for url, date_to_robots in data.items(): url = normalize_url(url) - _, parsed_result = robots_stats, url_interpretations = ( - parse_robots.analyze_robots(date_to_robots) - ) + _, parsed_result = parse_robots.analyze_robots(date_to_robots) for date_str, agent_to_status in parsed_result.items(): date = pd.to_datetime(date_str) for agent in relevant_agents: @@ -601,7 +599,6 @@ def prepare_robots_temporal_summary_detailed( return filled_status_summary - def 
robots_temporal_to_df(filled_status_summary, strictness_order, url_to_counts={}): """ Args: @@ -668,8 +665,6 @@ def robots_temporal_to_df(filled_status_summary, strictness_order, url_to_counts return summary_df - - def get_latest_url_robot_statuses(url_robots_summary, agents): # {URL --> Date --> Agent --> Status} # URL —> status @@ -811,7 +806,6 @@ def get_tos_url_time_verdicts( return url_to_time_to_verdict - def prepare_tos_temporal_summary( tos_time_verdicts, start_time, From 966c11437c9ace96bb652e69a2c2527f204f1498 Mon Sep 17 00:00:00 2001 From: "Ariel N. Lee" <103224818+arielnlee@users.noreply.github.com> Date: Mon, 6 Jan 2025 21:04:28 -0500 Subject: [PATCH 2/2] standardize formatting for robots_util --- src/web_analysis/robots_util.py | 263 ++++++++++++++++++++------------ 1 file changed, 168 insertions(+), 95 deletions(-) diff --git a/src/web_analysis/robots_util.py b/src/web_analysis/robots_util.py index da6d3199..751451a0 100644 --- a/src/web_analysis/robots_util.py +++ b/src/web_analysis/robots_util.py @@ -63,19 +63,13 @@ "retrieval": ["ClaudeBot", "CCBot"], # https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler }, - "Cohere": { - "train": ["cohere-ai"], - "retrieval": ["cohere-ai"] - }, + "Cohere": {"train": ["cohere-ai"], "retrieval": ["cohere-ai"]}, "Meta": { "train": ["FacebookBot"], "retrieval": ["FacebookBot"], # https://developers.facebook.com/docs/sharing/bot/ }, - "Internet Archive": { - "train": ["ia_archiver"], - "retrieval": ["ia_archiver"] - }, + "Internet Archive": {"train": ["ia_archiver"], "retrieval": ["ia_archiver"]}, "Google Search": { "train": ["Googlebot"], "retrieval": ["Googlebot"], @@ -139,9 +133,11 @@ def get_bots(company=None, setting=None): ###### URL --> Token Lookup Methods ############################################################ + def normalize_url(url): return url if url.startswith("www.") else f"www.{url}" + class URLTokenLookup: def __init__(self, file_path): """ @@ -176,13 +172,19 @@ def _create_lookup_map(self): # x = df.set_index('url')[['c4_tokens', 'rf_tokens', 'dolma_tokens']] # return {row['url']: (row['c4_tokens'], row['rf_tokens'], row['dolma_tokens']) for index, row in df.iterrows()} # print(df) - df['url'] = df['url'].apply(normalize_url) + df["url"] = df["url"].apply(normalize_url) # df = df.drop_duplicates(subset='url') - df = df.groupby('url').agg({ - 'c4_tokens': 'sum', - 'rf_tokens': 'sum', - 'dolma_tokens': 'sum', - }).reset_index() + df = ( + df.groupby("url") + .agg( + { + "c4_tokens": "sum", + "rf_tokens": "sum", + "dolma_tokens": "sum", + } + ) + .reset_index() + ) # print(df) # www.apnews.com,8981058,19450645,8819143 # apnews.com,14265514,15769066,143689592 @@ -207,7 +209,6 @@ def total_tokens(self, dataset_name): """ return self._TOTAL_TOKENS[dataset_name] - def url_tokens(self, url, dataset_name): """ Get the number of tokens for a specific URL and dataset. 
@@ -477,7 +478,9 @@ def read_start_dates(fpath, robots_urls): # Map the sanitized URLs back to the original URLs website_start_dates = { - normalize_url(url): start_dates.get(sanitize_url(url), pd.to_datetime("1970-01-01")) + normalize_url(url): start_dates.get( + sanitize_url(url), pd.to_datetime("1970-01-01") + ) for url in robots_urls } return website_start_dates @@ -763,7 +766,7 @@ def get_tos_url_time_verdicts( tos_policies, tos_license_policies, tos_compete_policies, - manual_annotated_urls, + manual_annotated_urls, website_start_dates, ): """ @@ -799,7 +802,9 @@ def get_tos_url_time_verdicts( # print(len(url_to_time_to_verdict)) for url in manual_annotated_urls: if url in website_start_dates and url not in url_to_time_to_verdict: - url_to_time_to_verdict[url] = {website_start_dates[url].strftime("%Y-%m-%d"): "No Terms Pages"} + url_to_time_to_verdict[url] = { + website_start_dates[url].strftime("%Y-%m-%d"): "No Terms Pages" + } # print(len(url_to_time_to_verdict)) print(f"{misses} / {misses + hits} dates missed due to time mismatches.") @@ -1052,15 +1057,22 @@ def encode_latest_tos_robots_into_df( url_results_df["robots"].fillna("none", inplace=True) if robots_detailed: url_results_df["Restrictive Robots.txt"] = url_results_df["robots"].map( - {'no_robots': False, 'none': False, 'none_sitemap': False, 'none_crawl_delay': False, - 'some_pattern_restrictions': False, 'some_disallow_important_dir': False, 'some_other': False, 'all': True} + { + "no_robots": False, + "none": False, + "none_sitemap": False, + "none_crawl_delay": False, + "some_pattern_restrictions": False, + "some_disallow_important_dir": False, + "some_other": False, + "all": True, + } ) else: url_results_df["Restrictive Robots.txt"] = url_results_df["robots"].map( {"all": True, "some": False, "none": False} ) - url_results_df["ToS"] = url_results_df["URL"].map(recent_url_tos_verdicts) # print(url_results_df[url_results_df["URL"]]) # print("start") @@ -1076,9 +1088,9 @@ def encode_latest_tos_robots_into_df( lambda x: x not in ["Unrestricted Use", "No Terms Pages"] ) - url_to_tos_map = dict(zip(url_results_df['URL'], url_results_df['ToS'])) - - return url_results_df #, url_to_tos_map + url_to_tos_map = dict(zip(url_results_df["URL"], url_results_df["ToS"])) + + return url_results_df # , url_to_tos_map def plot_robots_time_map_original(df, agent_type, val_key, frequency="M"): @@ -1792,19 +1804,37 @@ def prepare_temporal_robots_for_corpus( ): urlsubset_to_robots_summary, urlsubsets = {}, {} for key, url_subset in service_to_urls.items(): - url_robots_summary_rand_detailed = {url: url_robots_summary_detailed[url] for url in random_urls if url in url_robots_summary_detailed} - url_robots_summary_rand_subset = {url: details for url, details in url_robots_summary_rand_detailed.items() if url in url_subset} - url_robots_summary_head_detailed = {url: url_robots_summary_detailed[url] for url in head_urls if url in url_robots_summary_detailed} - url_robots_summary_head_subset = {url: details for url, details in url_robots_summary_head_detailed.items() if url in url_subset} + url_robots_summary_rand_detailed = { + url: url_robots_summary_detailed[url] + for url in random_urls + if url in url_robots_summary_detailed + } + url_robots_summary_rand_subset = { + url: details + for url, details in url_robots_summary_rand_detailed.items() + if url in url_subset + } + url_robots_summary_head_detailed = { + url: url_robots_summary_detailed[url] + for url in head_urls + if url in url_robots_summary_detailed + } + 
url_robots_summary_head_subset = { + url: details + for url, details in url_robots_summary_head_detailed.items() + if url in url_subset + } # RANDOM # {Period --> Agent --> Status --> set(URLs)} - robots_filled_status_rand_summary_detailed = prepare_robots_temporal_summary_detailed( - url_robots_summary=url_robots_summary_rand_subset, - group_to_agents=agent_groups_to_track, - start_time=temporal_start_date, - end_time=temporal_end_date, - time_frequency="M", - website_start_dates=website_start_dates, + robots_filled_status_rand_summary_detailed = ( + prepare_robots_temporal_summary_detailed( + url_robots_summary=url_robots_summary_rand_subset, + group_to_agents=agent_groups_to_track, + start_time=temporal_start_date, + end_time=temporal_end_date, + time_frequency="M", + website_start_dates=website_start_dates, + ) ) # [Period, Agent, Status, Count, Tokens, URLs] robots_temporal_rand_summary_detailed = robots_temporal_to_df( @@ -1814,13 +1844,15 @@ def prepare_temporal_robots_for_corpus( ) # HEAD # {Period --> Agent --> Status --> set(URLs)} - robots_filled_status_head_summary_detailed = prepare_robots_temporal_summary_detailed( - url_robots_summary=url_robots_summary_head_subset, - group_to_agents=agent_groups_to_track, - start_time=temporal_start_date, - end_time=temporal_end_date, - time_frequency="M", - website_start_dates=website_start_dates, + robots_filled_status_head_summary_detailed = ( + prepare_robots_temporal_summary_detailed( + url_robots_summary=url_robots_summary_head_subset, + group_to_agents=agent_groups_to_track, + start_time=temporal_start_date, + end_time=temporal_end_date, + time_frequency="M", + website_start_dates=website_start_dates, + ) ) # [Period, Agent, Status, Count, Tokens, URLs] robots_temporal_head_summary_detailed = robots_temporal_to_df( @@ -1828,15 +1860,19 @@ def prepare_temporal_robots_for_corpus( strictness_order=robots_strictness_order, url_to_counts=url_to_counts, ) - urlsubset_to_robots_summary[f"rand-{key}"] = robots_temporal_rand_summary_detailed - urlsubset_to_robots_summary[f"head-{key}"] = robots_temporal_head_summary_detailed + urlsubset_to_robots_summary[f"rand-{key}"] = ( + robots_temporal_rand_summary_detailed + ) + urlsubset_to_robots_summary[f"head-{key}"] = ( + robots_temporal_head_summary_detailed + ) urlsubsets[f"rand-{key}"] = set(url_robots_summary_rand_subset.keys()) urlsubsets[f"head-{key}"] = set(url_robots_summary_head_subset.keys()) # if key == "all": - # print(len(url_robots_summary_head_detailed)) - # print(len(url_robots_summary_head_subset)) - # target_head_summary_df = robots_temporal_head_summary_detailed + # print(len(url_robots_summary_head_detailed)) + # print(len(url_robots_summary_head_subset)) + # target_head_summary_df = robots_temporal_head_summary_detailed return urlsubset_to_robots_summary, urlsubsets @@ -1856,19 +1892,39 @@ def prepare_temporal_tos_for_corpus( ): urlsubset_to_tos_summary, urlsubsets = {}, {} url_to_time_to_tos_verdict = get_tos_url_time_verdicts( - tos_policies, tos_license_policies, tos_compete_policies, manual_annotated_urls, website_start_dates) + tos_policies, + tos_license_policies, + tos_compete_policies, + manual_annotated_urls, + website_start_dates, + ) for key, url_subset in service_to_urls.items(): # if key != "all": # continue - url_tos_summary_rand_detailed = {url: url_to_time_to_tos_verdict[url] for url in random_urls if url in url_to_time_to_tos_verdict} - url_tos_summary_rand_subset = {url: details for url, details in url_tos_summary_rand_detailed.items() if url in url_subset} - 
url_tos_summary_head_detailed = {url: url_to_time_to_tos_verdict[url] for url in head_urls if url in url_to_time_to_tos_verdict} - url_tos_summary_head_subset = {url: details for url, details in url_tos_summary_head_detailed.items() if url in url_subset} + url_tos_summary_rand_detailed = { + url: url_to_time_to_tos_verdict[url] + for url in random_urls + if url in url_to_time_to_tos_verdict + } + url_tos_summary_rand_subset = { + url: details + for url, details in url_tos_summary_rand_detailed.items() + if url in url_subset + } + url_tos_summary_head_detailed = { + url: url_to_time_to_tos_verdict[url] + for url in head_urls + if url in url_to_time_to_tos_verdict + } + url_tos_summary_head_subset = { + url: details + for url, details in url_tos_summary_head_detailed.items() + if url in url_subset + } # print(len(url_tos_summary_head_subset)) # print(len(url_tos_summary_rand_subset)) - # Period --> Status --> set(URLs) period_tos_verdict_urls_head = prepare_tos_temporal_summary( url_tos_summary_head_subset, @@ -1912,7 +1968,7 @@ def prepare_temporal_tos_for_corpus( urlsubset_to_tos_summary[f"head-{key}"] = tos_summary_df_head urlsubset_to_tos_summary[f"rand-{key}"] = tos_summary_df_rand urlsubsets[f"head-{key}"] = set(url_tos_summary_head_subset.keys()) - urlsubsets[f"rand-{key}"] = set(url_tos_summary_rand_subset.keys()) + urlsubsets[f"rand-{key}"] = set(url_tos_summary_rand_subset.keys()) return urlsubset_to_tos_summary, urlsubsets @@ -1924,41 +1980,34 @@ def compute_corpus_restriction_estimates( target_agent, restrictive_statuses, save_fpath, - verbose=False + verbose=False, ): # Get the minimum and maximum date from the 'period' column # rand_df['period'] = rand_df['period'].dt.to_timestamp() - start_period = rand_df['period'].min() - end_period = rand_df['period'].max() - all_periods = pd.period_range(start=start_period, end=end_period, freq='M') + start_period = rand_df["period"].min() + end_period = rand_df["period"].max() + all_periods = pd.period_range(start=start_period, end=end_period, freq="M") # print(rand_df[rand_df['agent'] == target_agent][rand_df["period"] == "2024-04"]) - rand_restricted = rand_df \ - .loc[rand_df['agent'] == target_agent] \ - .loc[rand_df['status'].isin(restrictive_statuses)] \ - [['period', 'tokens']] - rand_restricted = rand_restricted.groupby('period').sum()['tokens'] + rand_restricted = rand_df.loc[rand_df["agent"] == target_agent].loc[ + rand_df["status"].isin(restrictive_statuses) + ][["period", "tokens"]] + rand_restricted = rand_restricted.groupby("period").sum()["tokens"] rand_restricted = rand_restricted.reindex(all_periods, fill_value=0) - - rand_all = rand_df \ - .loc[rand_df['agent'] == target_agent] \ - [['period', 'tokens']] - rand_all = rand_all.groupby('period').sum()['tokens'] - - head_restricted = head_df \ - .loc[head_df['agent'] == target_agent] \ - .loc[head_df['status'].isin(restrictive_statuses)] \ - [['period', 'tokens', 'count']] + + rand_all = rand_df.loc[rand_df["agent"] == target_agent][["period", "tokens"]] + rand_all = rand_all.groupby("period").sum()["tokens"] + + head_restricted = head_df.loc[head_df["agent"] == target_agent].loc[ + head_df["status"].isin(restrictive_statuses) + ][["period", "tokens", "count"]] if verbose: print(head_restricted[head_restricted["period"] == "2024-04"]) - head_restricted = head_restricted.groupby('period').sum()['tokens'] + head_restricted = head_restricted.groupby("period").sum()["tokens"] head_restricted = head_restricted.reindex(all_periods, fill_value=0) - - head_all = head_df \ - 
.loc[head_df['agent'] == target_agent] \ - [['period', 'tokens']] - head_all = head_all.groupby('period').sum()['tokens'] + head_all = head_df.loc[head_df["agent"] == target_agent][["period", "tokens"]] + head_all = head_all.groupby("period").sum()["tokens"] assert (rand_restricted.index == head_restricted.index).all() assert (rand_all.index == head_all.index).all() @@ -1974,20 +2023,25 @@ def compute_corpus_restriction_estimates( # print(save_fpath) # print(f"rand token frac = {rand_token_frac}") # print(f"head token frac = {head_token_frac}") - out = pd.concat([ - rand_frac.rename('Random'), - head_frac.rename('Head'), - (rand_token_frac * rand_frac).rename('Rand Portion'), - (head_token_frac * head_frac).rename('Head Portion'), - ((rand_token_frac * rand_frac) + (head_token_frac * head_frac)).rename('Full Corpus'), - ], axis=1) + out = pd.concat( + [ + rand_frac.rename("Random"), + head_frac.rename("Head"), + (rand_token_frac * rand_frac).rename("Rand Portion"), + (head_token_frac * head_frac).rename("Head Portion"), + ((rand_token_frac * rand_frac) + (head_token_frac * head_frac)).rename( + "Full Corpus" + ), + ], + axis=1, + ) if verbose: print(head_frac[-1]) print(rand_frac[-1]) print(head_token_frac * head_frac[-1]) print(rand_token_frac * rand_frac[-1]) print((rand_token_frac * rand_frac[-1]) + (head_token_frac * head_frac[-1])) - + # out.to_csv(f'src/analysis/{CHOSEN_CORPUS}_total_token_estimates.csv', index=True) out.to_csv(save_fpath, index=True) @@ -1999,32 +2053,51 @@ def generate_corpus_restriction_estimates_per_url_split( url_token_lookup, target_agent, robot_statuses_to_include, - save_dir="output_data_robots" + save_dir="output_data_robots", ): - + total_tokens = url_token_lookup._TOTAL_TOKENS[target_corpus.lower()] top_corpus_urls = url_token_lookup.top_k_urls(target_corpus.lower(), 2000) corpus_urls_to_counts = url_token_lookup.get_url_to_token_map(target_corpus.lower()) - num_head_tokens = sum([v for k, v in corpus_urls_to_counts.items() if k in top_corpus_urls]) + num_head_tokens = sum( + [v for k, v in corpus_urls_to_counts.items() if k in top_corpus_urls] + ) non_head_tokens = total_tokens - num_head_tokens assert non_head_tokens >= 0 - + num_tokens_splits = {} service_keys = set([k.split("-")[-1] for k in urlsubset_to_robots_summary.keys()]) for key in service_keys: url_keys = list(urlsubset_to_robots_summary[f"head-{key}"].keys()) - num_head_service_tokens = sum([v for k, v in corpus_urls_to_counts.items() if k in url_subsets[f"head-{key}"]]) - num_rand_service_tokens = sum([v for k, v in corpus_urls_to_counts.items() if k in url_subsets[f"rand-{key}"]]) + num_head_service_tokens = sum( + [ + v + for k, v in corpus_urls_to_counts.items() + if k in url_subsets[f"head-{key}"] + ] + ) + num_rand_service_tokens = sum( + [ + v + for k, v in corpus_urls_to_counts.items() + if k in url_subsets[f"rand-{key}"] + ] + ) num_tokens_splits[key] = { "head": num_head_service_tokens / num_head_tokens, "rand": num_rand_service_tokens / non_head_tokens, } # overwrite "all". 
- num_tokens_splits["all"] = {"head": num_head_tokens / total_tokens, "rand": non_head_tokens / total_tokens} + num_tokens_splits["all"] = { + "head": num_head_tokens / total_tokens, + "rand": non_head_tokens / total_tokens, + } for i, splitkey in enumerate(service_keys): # print(splitkey) - save_fpath = os.path.join(save_dir, f'{target_corpus}_{splitkey}_token_estimates.csv') + save_fpath = os.path.join( + save_dir, f"{target_corpus}_{splitkey}_token_estimates.csv" + ) head_df = urlsubset_to_robots_summary[f"head-{splitkey}"] rand_df = urlsubset_to_robots_summary[f"rand-{splitkey}"] # if splitkey == "all":