From 847fc4c0ae540d49398d190bc5d207c7b0a88769 Mon Sep 17 00:00:00 2001 From: "Ariel N. Lee" <103224818+arielnlee@users.noreply.github.com> Date: Mon, 6 Jan 2025 20:57:45 -0500 Subject: [PATCH 1/2] fix parse_robots_txt --- src/web_analysis/parse_robots.py | 137 +++++++++++++++++++++---------- src/web_analysis/robots_util.py | 8 +- 2 files changed, 96 insertions(+), 49 deletions(-) diff --git a/src/web_analysis/parse_robots.py b/src/web_analysis/parse_robots.py index cc4cf859..990438f0 100644 --- a/src/web_analysis/parse_robots.py +++ b/src/web_analysis/parse_robots.py @@ -1,5 +1,4 @@ import argparse -import io import gzip import json from collections import defaultdict @@ -8,25 +7,46 @@ def parse_robots_txt(robots_txt): """Parses the robots.txt to create a map of agents, sitemaps, and parsing oddities.""" - rules = {"ERRORS": []} + rules = {"ERRORS": [], "Sitemaps": []} + agent_map = {} current_agents = [] - - for line in robots_txt.splitlines(): - line = line.strip() - if line.startswith('User-agent:'): - agent = line.split(':', 1)[1].strip() - if agent not in rules: - rules[agent] = defaultdict(list) - current_agents = [agent] - elif line.startswith(('Allow:', 'Disallow:', 'Crawl-delay:')) and current_agents: + + for raw_line in robots_txt.splitlines(): + line = raw_line.split("#", 1)[0].strip() + if not line: + continue + lower_line = line.lower() + + if lower_line.startswith("user-agent:"): + agent_name_raw = line.split(":", 1)[1].strip() + agent_name = agent_name_raw.lower() + + if agent_name not in agent_map: + agent_map[agent_name] = agent_name_raw + rules[agent_name_raw] = defaultdict(list) + current_agents.append(agent_name) + + elif any( + lower_line.startswith(d + ":") for d in ["allow", "disallow", "crawl-delay"] + ): + if not current_agents: + rules["ERRORS"].append( + f"Directive '{line}' found with no preceding user-agent." + ) + continue + directive_name = lower_line.split(":", 1)[0].strip() + directive_value = line.split(":", 1)[1].strip() + directive_key = directive_name.capitalize() for agent in current_agents: - rules[agent][line.split(":")[0]].append(":".join(line.split(":")[1:]).strip()) - elif line.lower().startswith('sitemap:'): - rules.setdefault('Sitemaps', []).append(line.split(':', 1)[1].strip()) - elif line == "" or line.startswith('#'): - current_agents = [] + original_name = agent_map[agent] + rules[original_name][directive_key].append(directive_value) + + elif lower_line.startswith("sitemap:"): + sitemap_value = line.split(":", 1)[1].strip() + rules["Sitemaps"].append(sitemap_value) else: - rules["ERRORS"].append(f"Unmatched line: {line}") + rules["ERRORS"].append(f"Unmatched line: {raw_line}") + return rules @@ -39,7 +59,11 @@ def parallel_parse_robots(data): url_to_rules = {url: {} for url, txt in data.items() if not txt} # interpret the robots.txt with ThreadPoolExecutor(max_workers=10) as executor: - future_to_url = {executor.submit(parse_robots_txt, txt): url for url, txt in data.items() if txt} + future_to_url = { + executor.submit(parse_robots_txt, txt): url + for url, txt in data.items() + if txt + } for future in as_completed(future_to_url): url = future_to_url[future] try: @@ -54,9 +78,13 @@ def interpret_agent(rules): agent_disallow = [x for x in rules.get("Disallow", []) if "?" not in x] agent_allow = [x for x in rules.get("Allow", []) if "?" 
not in x] - if len(agent_disallow) == 0 or agent_disallow == [""] or (agent_allow == agent_disallow): + if ( + len(agent_disallow) == 0 + or agent_disallow == [""] + or (agent_allow == agent_disallow) + ): disallow_type = "none" - elif any('/' == x.strip() for x in agent_disallow) and len(agent_allow) == 0: + elif any("/" == x.strip() for x in agent_disallow) and len(agent_allow) == 0: disallow_type = "all" else: disallow_type = "some" @@ -65,7 +93,7 @@ def interpret_agent(rules): def interpret_robots(agent_rules, all_agents): - """Given the robots.txt agent rules, and a list of relevant agents + """Given the robots.txt agent rules, and a list of relevant agents (a superset of the robots.txt), determine whether they are: (1) blocked from scraping all parts of the website @@ -80,12 +108,14 @@ def interpret_robots(agent_rules, all_agents): for agent in all_agents: rule = agent_rules.get(agent) - judgement = interpret_agent(rule) if rule is not None else agent_to_judgement["*"] + judgement = ( + interpret_agent(rule) if rule is not None else agent_to_judgement["*"] + ) agent_to_judgement[agent] = judgement return agent_to_judgement - + def aggregate_robots(url_to_rules, all_agents): """Across all robots.txt, determine basic stats: (1) For each agent, how often it is explicitly mentioned @@ -93,7 +123,7 @@ def aggregate_robots(url_to_rules, all_agents): (3) For each agent, how often it is blocked from scraping part of the website (4) For each agent, how often it is NOT blocked from scraping at all """ - robots_stats = defaultdict(lambda: {'counter': 0, 'all': 0, 'some': 0, 'none': 0}) + robots_stats = defaultdict(lambda: {"counter": 0, "all": 0, "some": 0, "none": 0}) url_decisions = {} for url, robots in url_to_rules.items(): @@ -103,11 +133,11 @@ def aggregate_robots(url_to_rules, all_agents): for agent in all_agents + ["*"]: judgement = agent_to_judgement[agent] robots_stats[agent][judgement] += 1 - if agent in robots: # Missing robots.txt means nothing blocked + if agent in robots: # Missing robots.txt means nothing blocked robots_stats[agent]["counter"] += 1 - # See if *All Agents* are blocked on all content, - # or at least some agents can access some or more content, or + # See if *All Agents* are blocked on all content, + # or at least some agents can access some or more content, or # there are no blocks on any agents at all. if all(v == "all" for v in agent_to_judgement.values()): agg_judgement = "all" @@ -118,14 +148,15 @@ def aggregate_robots(url_to_rules, all_agents): robots_stats["*All Agents*"]["counter"] += 1 robots_stats["*All Agents*"][agg_judgement] += 1 url_decisions[url]["*All Agents*"] = agg_judgement - + return robots_stats, url_decisions def read_robots_file(file_path): - with gzip.open(file_path, 'rt') as file: + with gzip.open(file_path, "rt") as file: return json.load(file) + def analyze_robots(data): """Takes in {URL --> robots text} @@ -136,20 +167,34 @@ def analyze_robots(data): `some` is how many times its had some paths blocked `none` is how many times its had none paths blocked url_interpretations: {URL --> {Agent --> }} - Only includes agents seen at this URL. + Only includes agents seen at this URL. Agents not seen at this URL will inherit "*" rules, or `none` at all. 
""" url_to_rules = parallel_parse_robots(data) # Collect all agents - all_agents = list(set([agent for url, rules in url_to_rules.items() - for agent, _ in rules.items() if agent not in ["ERRORS", "Sitemaps", "*"]])) + all_agents = list( + set( + [ + agent + for url, rules in url_to_rules.items() + for agent, _ in rules.items() + if agent not in ["ERRORS", "Sitemaps", "*"] + ] + ) + ) robots_stats, url_decisions = aggregate_robots(url_to_rules, all_agents) # URL --> agents in its robots.txt and their decisions (all/some/none). - url_interpretations = {k: - {agent: v for agent, v in vs.items() if agent not in ["ERRORS", "Sitemaps"] and agent in list(url_to_rules[k]) + ["*All Agents*"]} - for k, vs in url_decisions.items()} + url_interpretations = { + k: { + agent: v + for agent, v in vs.items() + if agent not in ["ERRORS", "Sitemaps"] + and agent in list(url_to_rules[k]) + ["*All Agents*"] + } + for k, vs in url_decisions.items() + } return robots_stats, url_interpretations @@ -157,7 +202,9 @@ def analyze_robots(data): def main(args): data = read_robots_file(args.file_path) print(f"Total URLs: {len(data)}") - print(f"URLs with robots.txt: {sum(1 for robots_txt in data.values() if robots_txt)}") + print( + f"URLs with robots.txt: {sum(1 for robots_txt in data.values() if robots_txt)}" + ) robots_stats, url_interpretations = analyze_robots(data) @@ -168,14 +215,20 @@ def main(args): # import pdb; pdb.set_trace() # print(url_interpretations["http://www.machinenoveltranslation.com/robots.txt"]) - sorted_robots_stats = sorted(robots_stats.items(), key=lambda x: x[1]['counter'] / len(data) if len(data) > 0 else 0, reverse=True) + sorted_robots_stats = sorted( + robots_stats.items(), + key=lambda x: x[1]["counter"] / len(data) if len(data) > 0 else 0, + reverse=True, + ) - print(f"{'Agent':<30} {'Observed': <10} {'Blocked All': >15} {'Blocked Some': >15} {'Blocked None': >15}") + print( + f"{'Agent':<30} {'Observed': <10} {'Blocked All': >15} {'Blocked Some': >15} {'Blocked None': >15}" + ) for i, (agent, stats) in enumerate(sorted_robots_stats): - counter_pct = stats['counter'] / len(data) * 100 if len(data) > 0 else 0 - all_pct = stats['all'] / len(data) * 100 if len(data) > 0 else 0 - some_pct = stats['some'] / len(data) * 100 if len(data) > 0 else 0 - none_pct = stats['none'] / len(data) * 100 if len(data) > 0 else 0 + counter_pct = stats["counter"] / len(data) * 100 if len(data) > 0 else 0 + all_pct = stats["all"] / len(data) * 100 if len(data) > 0 else 0 + some_pct = stats["some"] / len(data) * 100 if len(data) > 0 else 0 + none_pct = stats["none"] / len(data) * 100 if len(data) > 0 else 0 print_outs = [ f"{agent:<20}", f"{stats['counter']:>5} ({counter_pct:5.1f}%) {'':>5} ", diff --git a/src/web_analysis/robots_util.py b/src/web_analysis/robots_util.py index 76b987f0..da6d3199 100644 --- a/src/web_analysis/robots_util.py +++ b/src/web_analysis/robots_util.py @@ -428,9 +428,7 @@ def compute_url_date_agent_status_detailed(data, relevant_agents): status_summary = defaultdict(lambda: defaultdict(lambda: defaultdict(str))) for url, date_to_robots in data.items(): url = normalize_url(url) - _, parsed_result = robots_stats, url_interpretations = ( - parse_robots.analyze_robots(date_to_robots) - ) + _, parsed_result = parse_robots.analyze_robots(date_to_robots) for date_str, agent_to_status in parsed_result.items(): date = pd.to_datetime(date_str) for agent in relevant_agents: @@ -601,7 +599,6 @@ def prepare_robots_temporal_summary_detailed( return filled_status_summary - def 
robots_temporal_to_df(filled_status_summary, strictness_order, url_to_counts={}): """ Args: @@ -668,8 +665,6 @@ def robots_temporal_to_df(filled_status_summary, strictness_order, url_to_counts return summary_df - - def get_latest_url_robot_statuses(url_robots_summary, agents): # {URL --> Date --> Agent --> Status} # URL —> status @@ -811,7 +806,6 @@ def get_tos_url_time_verdicts( return url_to_time_to_verdict - def prepare_tos_temporal_summary( tos_time_verdicts, start_time, From 966c11437c9ace96bb652e69a2c2527f204f1498 Mon Sep 17 00:00:00 2001 From: "Ariel N. Lee" <103224818+arielnlee@users.noreply.github.com> Date: Mon, 6 Jan 2025 21:04:28 -0500 Subject: [PATCH 2/2] standardize formatting for robots_util --- src/web_analysis/robots_util.py | 263 ++++++++++++++++++++------------ 1 file changed, 168 insertions(+), 95 deletions(-) diff --git a/src/web_analysis/robots_util.py b/src/web_analysis/robots_util.py index da6d3199..751451a0 100644 --- a/src/web_analysis/robots_util.py +++ b/src/web_analysis/robots_util.py @@ -63,19 +63,13 @@ "retrieval": ["ClaudeBot", "CCBot"], # https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler }, - "Cohere": { - "train": ["cohere-ai"], - "retrieval": ["cohere-ai"] - }, + "Cohere": {"train": ["cohere-ai"], "retrieval": ["cohere-ai"]}, "Meta": { "train": ["FacebookBot"], "retrieval": ["FacebookBot"], # https://developers.facebook.com/docs/sharing/bot/ }, - "Internet Archive": { - "train": ["ia_archiver"], - "retrieval": ["ia_archiver"] - }, + "Internet Archive": {"train": ["ia_archiver"], "retrieval": ["ia_archiver"]}, "Google Search": { "train": ["Googlebot"], "retrieval": ["Googlebot"], @@ -139,9 +133,11 @@ def get_bots(company=None, setting=None): ###### URL --> Token Lookup Methods ############################################################ + def normalize_url(url): return url if url.startswith("www.") else f"www.{url}" + class URLTokenLookup: def __init__(self, file_path): """ @@ -176,13 +172,19 @@ def _create_lookup_map(self): # x = df.set_index('url')[['c4_tokens', 'rf_tokens', 'dolma_tokens']] # return {row['url']: (row['c4_tokens'], row['rf_tokens'], row['dolma_tokens']) for index, row in df.iterrows()} # print(df) - df['url'] = df['url'].apply(normalize_url) + df["url"] = df["url"].apply(normalize_url) # df = df.drop_duplicates(subset='url') - df = df.groupby('url').agg({ - 'c4_tokens': 'sum', - 'rf_tokens': 'sum', - 'dolma_tokens': 'sum', - }).reset_index() + df = ( + df.groupby("url") + .agg( + { + "c4_tokens": "sum", + "rf_tokens": "sum", + "dolma_tokens": "sum", + } + ) + .reset_index() + ) # print(df) # www.apnews.com,8981058,19450645,8819143 # apnews.com,14265514,15769066,143689592 @@ -207,7 +209,6 @@ def total_tokens(self, dataset_name): """ return self._TOTAL_TOKENS[dataset_name] - def url_tokens(self, url, dataset_name): """ Get the number of tokens for a specific URL and dataset. 
@@ -477,7 +478,9 @@ def read_start_dates(fpath, robots_urls): # Map the sanitized URLs back to the original URLs website_start_dates = { - normalize_url(url): start_dates.get(sanitize_url(url), pd.to_datetime("1970-01-01")) + normalize_url(url): start_dates.get( + sanitize_url(url), pd.to_datetime("1970-01-01") + ) for url in robots_urls } return website_start_dates @@ -763,7 +766,7 @@ def get_tos_url_time_verdicts( tos_policies, tos_license_policies, tos_compete_policies, - manual_annotated_urls, + manual_annotated_urls, website_start_dates, ): """ @@ -799,7 +802,9 @@ def get_tos_url_time_verdicts( # print(len(url_to_time_to_verdict)) for url in manual_annotated_urls: if url in website_start_dates and url not in url_to_time_to_verdict: - url_to_time_to_verdict[url] = {website_start_dates[url].strftime("%Y-%m-%d"): "No Terms Pages"} + url_to_time_to_verdict[url] = { + website_start_dates[url].strftime("%Y-%m-%d"): "No Terms Pages" + } # print(len(url_to_time_to_verdict)) print(f"{misses} / {misses + hits} dates missed due to time mismatches.") @@ -1052,15 +1057,22 @@ def encode_latest_tos_robots_into_df( url_results_df["robots"].fillna("none", inplace=True) if robots_detailed: url_results_df["Restrictive Robots.txt"] = url_results_df["robots"].map( - {'no_robots': False, 'none': False, 'none_sitemap': False, 'none_crawl_delay': False, - 'some_pattern_restrictions': False, 'some_disallow_important_dir': False, 'some_other': False, 'all': True} + { + "no_robots": False, + "none": False, + "none_sitemap": False, + "none_crawl_delay": False, + "some_pattern_restrictions": False, + "some_disallow_important_dir": False, + "some_other": False, + "all": True, + } ) else: url_results_df["Restrictive Robots.txt"] = url_results_df["robots"].map( {"all": True, "some": False, "none": False} ) - url_results_df["ToS"] = url_results_df["URL"].map(recent_url_tos_verdicts) # print(url_results_df[url_results_df["URL"]]) # print("start") @@ -1076,9 +1088,9 @@ def encode_latest_tos_robots_into_df( lambda x: x not in ["Unrestricted Use", "No Terms Pages"] ) - url_to_tos_map = dict(zip(url_results_df['URL'], url_results_df['ToS'])) - - return url_results_df #, url_to_tos_map + url_to_tos_map = dict(zip(url_results_df["URL"], url_results_df["ToS"])) + + return url_results_df # , url_to_tos_map def plot_robots_time_map_original(df, agent_type, val_key, frequency="M"): @@ -1792,19 +1804,37 @@ def prepare_temporal_robots_for_corpus( ): urlsubset_to_robots_summary, urlsubsets = {}, {} for key, url_subset in service_to_urls.items(): - url_robots_summary_rand_detailed = {url: url_robots_summary_detailed[url] for url in random_urls if url in url_robots_summary_detailed} - url_robots_summary_rand_subset = {url: details for url, details in url_robots_summary_rand_detailed.items() if url in url_subset} - url_robots_summary_head_detailed = {url: url_robots_summary_detailed[url] for url in head_urls if url in url_robots_summary_detailed} - url_robots_summary_head_subset = {url: details for url, details in url_robots_summary_head_detailed.items() if url in url_subset} + url_robots_summary_rand_detailed = { + url: url_robots_summary_detailed[url] + for url in random_urls + if url in url_robots_summary_detailed + } + url_robots_summary_rand_subset = { + url: details + for url, details in url_robots_summary_rand_detailed.items() + if url in url_subset + } + url_robots_summary_head_detailed = { + url: url_robots_summary_detailed[url] + for url in head_urls + if url in url_robots_summary_detailed + } + 
url_robots_summary_head_subset = { + url: details + for url, details in url_robots_summary_head_detailed.items() + if url in url_subset + } # RANDOM # {Period --> Agent --> Status --> set(URLs)} - robots_filled_status_rand_summary_detailed = prepare_robots_temporal_summary_detailed( - url_robots_summary=url_robots_summary_rand_subset, - group_to_agents=agent_groups_to_track, - start_time=temporal_start_date, - end_time=temporal_end_date, - time_frequency="M", - website_start_dates=website_start_dates, + robots_filled_status_rand_summary_detailed = ( + prepare_robots_temporal_summary_detailed( + url_robots_summary=url_robots_summary_rand_subset, + group_to_agents=agent_groups_to_track, + start_time=temporal_start_date, + end_time=temporal_end_date, + time_frequency="M", + website_start_dates=website_start_dates, + ) ) # [Period, Agent, Status, Count, Tokens, URLs] robots_temporal_rand_summary_detailed = robots_temporal_to_df( @@ -1814,13 +1844,15 @@ def prepare_temporal_robots_for_corpus( ) # HEAD # {Period --> Agent --> Status --> set(URLs)} - robots_filled_status_head_summary_detailed = prepare_robots_temporal_summary_detailed( - url_robots_summary=url_robots_summary_head_subset, - group_to_agents=agent_groups_to_track, - start_time=temporal_start_date, - end_time=temporal_end_date, - time_frequency="M", - website_start_dates=website_start_dates, + robots_filled_status_head_summary_detailed = ( + prepare_robots_temporal_summary_detailed( + url_robots_summary=url_robots_summary_head_subset, + group_to_agents=agent_groups_to_track, + start_time=temporal_start_date, + end_time=temporal_end_date, + time_frequency="M", + website_start_dates=website_start_dates, + ) ) # [Period, Agent, Status, Count, Tokens, URLs] robots_temporal_head_summary_detailed = robots_temporal_to_df( @@ -1828,15 +1860,19 @@ def prepare_temporal_robots_for_corpus( strictness_order=robots_strictness_order, url_to_counts=url_to_counts, ) - urlsubset_to_robots_summary[f"rand-{key}"] = robots_temporal_rand_summary_detailed - urlsubset_to_robots_summary[f"head-{key}"] = robots_temporal_head_summary_detailed + urlsubset_to_robots_summary[f"rand-{key}"] = ( + robots_temporal_rand_summary_detailed + ) + urlsubset_to_robots_summary[f"head-{key}"] = ( + robots_temporal_head_summary_detailed + ) urlsubsets[f"rand-{key}"] = set(url_robots_summary_rand_subset.keys()) urlsubsets[f"head-{key}"] = set(url_robots_summary_head_subset.keys()) # if key == "all": - # print(len(url_robots_summary_head_detailed)) - # print(len(url_robots_summary_head_subset)) - # target_head_summary_df = robots_temporal_head_summary_detailed + # print(len(url_robots_summary_head_detailed)) + # print(len(url_robots_summary_head_subset)) + # target_head_summary_df = robots_temporal_head_summary_detailed return urlsubset_to_robots_summary, urlsubsets @@ -1856,19 +1892,39 @@ def prepare_temporal_tos_for_corpus( ): urlsubset_to_tos_summary, urlsubsets = {}, {} url_to_time_to_tos_verdict = get_tos_url_time_verdicts( - tos_policies, tos_license_policies, tos_compete_policies, manual_annotated_urls, website_start_dates) + tos_policies, + tos_license_policies, + tos_compete_policies, + manual_annotated_urls, + website_start_dates, + ) for key, url_subset in service_to_urls.items(): # if key != "all": # continue - url_tos_summary_rand_detailed = {url: url_to_time_to_tos_verdict[url] for url in random_urls if url in url_to_time_to_tos_verdict} - url_tos_summary_rand_subset = {url: details for url, details in url_tos_summary_rand_detailed.items() if url in url_subset} - 
url_tos_summary_head_detailed = {url: url_to_time_to_tos_verdict[url] for url in head_urls if url in url_to_time_to_tos_verdict} - url_tos_summary_head_subset = {url: details for url, details in url_tos_summary_head_detailed.items() if url in url_subset} + url_tos_summary_rand_detailed = { + url: url_to_time_to_tos_verdict[url] + for url in random_urls + if url in url_to_time_to_tos_verdict + } + url_tos_summary_rand_subset = { + url: details + for url, details in url_tos_summary_rand_detailed.items() + if url in url_subset + } + url_tos_summary_head_detailed = { + url: url_to_time_to_tos_verdict[url] + for url in head_urls + if url in url_to_time_to_tos_verdict + } + url_tos_summary_head_subset = { + url: details + for url, details in url_tos_summary_head_detailed.items() + if url in url_subset + } # print(len(url_tos_summary_head_subset)) # print(len(url_tos_summary_rand_subset)) - # Period --> Status --> set(URLs) period_tos_verdict_urls_head = prepare_tos_temporal_summary( url_tos_summary_head_subset, @@ -1912,7 +1968,7 @@ def prepare_temporal_tos_for_corpus( urlsubset_to_tos_summary[f"head-{key}"] = tos_summary_df_head urlsubset_to_tos_summary[f"rand-{key}"] = tos_summary_df_rand urlsubsets[f"head-{key}"] = set(url_tos_summary_head_subset.keys()) - urlsubsets[f"rand-{key}"] = set(url_tos_summary_rand_subset.keys()) + urlsubsets[f"rand-{key}"] = set(url_tos_summary_rand_subset.keys()) return urlsubset_to_tos_summary, urlsubsets @@ -1924,41 +1980,34 @@ def compute_corpus_restriction_estimates( target_agent, restrictive_statuses, save_fpath, - verbose=False + verbose=False, ): # Get the minimum and maximum date from the 'period' column # rand_df['period'] = rand_df['period'].dt.to_timestamp() - start_period = rand_df['period'].min() - end_period = rand_df['period'].max() - all_periods = pd.period_range(start=start_period, end=end_period, freq='M') + start_period = rand_df["period"].min() + end_period = rand_df["period"].max() + all_periods = pd.period_range(start=start_period, end=end_period, freq="M") # print(rand_df[rand_df['agent'] == target_agent][rand_df["period"] == "2024-04"]) - rand_restricted = rand_df \ - .loc[rand_df['agent'] == target_agent] \ - .loc[rand_df['status'].isin(restrictive_statuses)] \ - [['period', 'tokens']] - rand_restricted = rand_restricted.groupby('period').sum()['tokens'] + rand_restricted = rand_df.loc[rand_df["agent"] == target_agent].loc[ + rand_df["status"].isin(restrictive_statuses) + ][["period", "tokens"]] + rand_restricted = rand_restricted.groupby("period").sum()["tokens"] rand_restricted = rand_restricted.reindex(all_periods, fill_value=0) - - rand_all = rand_df \ - .loc[rand_df['agent'] == target_agent] \ - [['period', 'tokens']] - rand_all = rand_all.groupby('period').sum()['tokens'] - - head_restricted = head_df \ - .loc[head_df['agent'] == target_agent] \ - .loc[head_df['status'].isin(restrictive_statuses)] \ - [['period', 'tokens', 'count']] + + rand_all = rand_df.loc[rand_df["agent"] == target_agent][["period", "tokens"]] + rand_all = rand_all.groupby("period").sum()["tokens"] + + head_restricted = head_df.loc[head_df["agent"] == target_agent].loc[ + head_df["status"].isin(restrictive_statuses) + ][["period", "tokens", "count"]] if verbose: print(head_restricted[head_restricted["period"] == "2024-04"]) - head_restricted = head_restricted.groupby('period').sum()['tokens'] + head_restricted = head_restricted.groupby("period").sum()["tokens"] head_restricted = head_restricted.reindex(all_periods, fill_value=0) - - head_all = head_df \ - 
.loc[head_df['agent'] == target_agent] \ - [['period', 'tokens']] - head_all = head_all.groupby('period').sum()['tokens'] + head_all = head_df.loc[head_df["agent"] == target_agent][["period", "tokens"]] + head_all = head_all.groupby("period").sum()["tokens"] assert (rand_restricted.index == head_restricted.index).all() assert (rand_all.index == head_all.index).all() @@ -1974,20 +2023,25 @@ def compute_corpus_restriction_estimates( # print(save_fpath) # print(f"rand token frac = {rand_token_frac}") # print(f"head token frac = {head_token_frac}") - out = pd.concat([ - rand_frac.rename('Random'), - head_frac.rename('Head'), - (rand_token_frac * rand_frac).rename('Rand Portion'), - (head_token_frac * head_frac).rename('Head Portion'), - ((rand_token_frac * rand_frac) + (head_token_frac * head_frac)).rename('Full Corpus'), - ], axis=1) + out = pd.concat( + [ + rand_frac.rename("Random"), + head_frac.rename("Head"), + (rand_token_frac * rand_frac).rename("Rand Portion"), + (head_token_frac * head_frac).rename("Head Portion"), + ((rand_token_frac * rand_frac) + (head_token_frac * head_frac)).rename( + "Full Corpus" + ), + ], + axis=1, + ) if verbose: print(head_frac[-1]) print(rand_frac[-1]) print(head_token_frac * head_frac[-1]) print(rand_token_frac * rand_frac[-1]) print((rand_token_frac * rand_frac[-1]) + (head_token_frac * head_frac[-1])) - + # out.to_csv(f'src/analysis/{CHOSEN_CORPUS}_total_token_estimates.csv', index=True) out.to_csv(save_fpath, index=True) @@ -1999,32 +2053,51 @@ def generate_corpus_restriction_estimates_per_url_split( url_token_lookup, target_agent, robot_statuses_to_include, - save_dir="output_data_robots" + save_dir="output_data_robots", ): - + total_tokens = url_token_lookup._TOTAL_TOKENS[target_corpus.lower()] top_corpus_urls = url_token_lookup.top_k_urls(target_corpus.lower(), 2000) corpus_urls_to_counts = url_token_lookup.get_url_to_token_map(target_corpus.lower()) - num_head_tokens = sum([v for k, v in corpus_urls_to_counts.items() if k in top_corpus_urls]) + num_head_tokens = sum( + [v for k, v in corpus_urls_to_counts.items() if k in top_corpus_urls] + ) non_head_tokens = total_tokens - num_head_tokens assert non_head_tokens >= 0 - + num_tokens_splits = {} service_keys = set([k.split("-")[-1] for k in urlsubset_to_robots_summary.keys()]) for key in service_keys: url_keys = list(urlsubset_to_robots_summary[f"head-{key}"].keys()) - num_head_service_tokens = sum([v for k, v in corpus_urls_to_counts.items() if k in url_subsets[f"head-{key}"]]) - num_rand_service_tokens = sum([v for k, v in corpus_urls_to_counts.items() if k in url_subsets[f"rand-{key}"]]) + num_head_service_tokens = sum( + [ + v + for k, v in corpus_urls_to_counts.items() + if k in url_subsets[f"head-{key}"] + ] + ) + num_rand_service_tokens = sum( + [ + v + for k, v in corpus_urls_to_counts.items() + if k in url_subsets[f"rand-{key}"] + ] + ) num_tokens_splits[key] = { "head": num_head_service_tokens / num_head_tokens, "rand": num_rand_service_tokens / non_head_tokens, } # overwrite "all". 
- num_tokens_splits["all"] = {"head": num_head_tokens / total_tokens, "rand": non_head_tokens / total_tokens} + num_tokens_splits["all"] = { + "head": num_head_tokens / total_tokens, + "rand": non_head_tokens / total_tokens, + } for i, splitkey in enumerate(service_keys): # print(splitkey) - save_fpath = os.path.join(save_dir, f'{target_corpus}_{splitkey}_token_estimates.csv') + save_fpath = os.path.join( + save_dir, f"{target_corpus}_{splitkey}_token_estimates.csv" + ) head_df = urlsubset_to_robots_summary[f"head-{splitkey}"] rand_df = urlsubset_to_robots_summary[f"rand-{splitkey}"] # if splitkey == "all":