diff --git a/README.md b/README.md index 3c421a7..1dca7bc 100644 --- a/README.md +++ b/README.md @@ -28,9 +28,9 @@ The program requires three input files to operate: For the English Wikipedia, you can get these at as: -* enwiki-yyyy-mm-dd-page.sql.gz -* enwiki-yyyy-mm-dd-redirect.sql.gz -* enwiki-yyyy-mm-dd-pagelinks.sql.gz +* enwiki-yyyymmdd-page.sql.gz +* enwiki-yyyymmdd-redirect.sql.gz +* enwiki-yyyymmdd-pagelinks.sql.gz ### Hardware @@ -62,9 +62,15 @@ Custom (128 MiB) buffer size and a link-count cutoff of 185K, below which pages wikidigest-link-count -p page.sql.gz -r redirect.sql.gz -l pagelinks.sql.gz -o /tmp/185k-or-more -b 128 -c 185000 ``` +Export as different format ([WikiText](https://en.wikipedia.org/wiki/Help:Wikitext) table): + +``` +wikidigest-link-count -p page.sql -r redirect.sql -l pagelinks.sql -e wikitext +``` + ## Results -Results are written to a plain text file. +Results are written to an output file, by default as Plaintext to `./results.txt`. Below results for the English Wikipedia, Apr 2020 – pages with 200K or more incoming links within the main (0) namespace: diff --git a/notes.md b/notes.md index ad91145..0f8788f 100644 --- a/notes.md +++ b/notes.md @@ -24,6 +24,4 @@ - XML - JSON - HTML (table) - - Markdown (table) - - Wiki (table) diff --git a/src/buffer_queue.rs b/src/buffer_queue.rs index 2d62dfe..78a0134 100644 --- a/src/buffer_queue.rs +++ b/src/buffer_queue.rs @@ -10,7 +10,7 @@ const QUERY_INTERVAL_MS: u64 = 250; pub struct BufferQueue { buffers: Vec>, - queue: Mutex> + queue: Mutex>, } impl BufferQueue { @@ -25,7 +25,7 @@ impl BufferQueue { Self { buffers, - queue: Mutex::new(queue) + queue: Mutex::new(queue), } } @@ -34,7 +34,7 @@ impl BufferQueue { Buffer { id, inner: &self.buffers[id], - queue: &self.queue + queue: &self.queue, } } @@ -57,7 +57,7 @@ Once released, it pushes its ID back into the shared queue, allowing the resourc pub struct Buffer<'a> { id: usize, inner: &'a Mutex, - queue: &'a Mutex> + queue: &'a Mutex>, } impl<'a> Buffer<'a> { diff --git a/src/chunked_reader.rs b/src/chunked_reader.rs index 879f917..2c1c220 100644 --- a/src/chunked_reader.rs +++ b/src/chunked_reader.rs @@ -4,10 +4,10 @@ the door to parallel processing. 
*/ use std::io::{Read, Result}; -pub struct ChunkedReader { +pub struct ChunkedReader { source: T, remainder: Vec, - exhausted: bool + exhausted: bool, } impl ChunkedReader { @@ -15,7 +15,7 @@ impl ChunkedReader { Self { source, remainder: Vec::new(), - exhausted: false + exhausted: false, } } @@ -44,7 +44,8 @@ impl ChunkedReader { dest.set_len(target_size); // Restore to full target size while bytes_read_total < target_size { let bytes_read = self.source.read(&mut dest[bytes_read_total..])?; - if bytes_read == 0 { // Assume final read + if bytes_read == 0 { + // Assume final read dest.truncate(bytes_read_total); self.exhausted = true; return Ok(false); // Signal final read @@ -88,7 +89,8 @@ impl ChunkedReader { chunk.set_len(target_size); // Restore to full target size while bytes_read_total < target_size { let bytes_read = self.source.read(&mut chunk[bytes_read_total..])?; - if bytes_read == 0 { // Assume final read + if bytes_read == 0 { + // Assume final read chunk.truncate(bytes_read_total); self.exhausted = true; break; @@ -101,11 +103,12 @@ impl ChunkedReader { // Split at last newline, keep right-hand side for next chunk self.remainder.clear(); if let Some(cutoff) = chunk.rfind('\n') { - self.remainder.extend_from_slice(&chunk.as_bytes()[cutoff..]); + self.remainder + .extend_from_slice(&chunk.as_bytes()[cutoff..]); chunk.truncate(cutoff); } } Ok(Some(chunk)) } -} \ No newline at end of file +} diff --git a/src/cli.rs b/src/cli.rs index c017873..892142f 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -4,8 +4,9 @@ Parsing CLI arguments use crate::util::{ExportFormat, PageNs}; use anyhow::Result; -use clap::{Arg, App}; +use clap::{App, Arg}; +use std::convert::TryFrom; use std::path::PathBuf; use std::str::FromStr; @@ -18,12 +19,11 @@ pub struct CliParams { pub namespaces_to: Vec, pub buf_size_mib: usize, pub cutoff_threshold: u32, - pub export_format: ExportFormat + pub export_format: ExportFormat, } pub fn init_cli_app() -> Result { - let matches = - App::new("wikidigest-link-count") + let matches = App::new("wikidigest-link-count") .version("0.1") .author("github.com/adrian5") .about("Find the most linked-to pages in MediaWiki databases") @@ -32,74 +32,80 @@ pub fn init_cli_app() -> Result { 1. The page-table SQL dump (…page.sql.gz)\n\ 2. The redirect-table SQL dump (…redirect.sql.gz)\n\ 3. 
The pagelinks-table SQL dump (…pagelinks.sql.gz)\n\n\ - For the English Wikipedia, you can get these at https://dumps.wikimedia.org/enwiki/" + For the English Wikipedia, you can get these at https://dumps.wikimedia.org/enwiki/", ) // Page file - .arg(Arg::with_name("file-page") - .short("p") - .long("page-file") - .value_name("PATH") - .help("Path to ‘…page.sql(.gz)’") - .takes_value(true) - .required(true) + .arg( + Arg::with_name("file-page") + .short("p") + .long("page-file") + .value_name("PATH") + .help("Path to ‘…page.sql(.gz)’") + .takes_value(true) + .required(true), ) // Redirect file - .arg(Arg::with_name("file-redirect") - .short("r") - .long("redirect-file") - .value_name("PATH") - .help("Path to ‘…redirect.sql(.gz)’") - .takes_value(true) - .required(true) + .arg( + Arg::with_name("file-redirect") + .short("r") + .long("redirect-file") + .value_name("PATH") + .help("Path to ‘…redirect.sql(.gz)’") + .takes_value(true) + .required(true), ) // Pagelinks file - .arg(Arg::with_name("file-pagelinks") - .short("l") - .long("pagelinks-file") - .value_name("PATH") - .help("Path to ‘…pagelinks.sql(.gz)’") - .takes_value(true) - .required(true) + .arg( + Arg::with_name("file-pagelinks") + .short("l") + .long("pagelinks-file") + .value_name("PATH") + .help("Path to ‘…pagelinks.sql(.gz)’") + .takes_value(true) + .required(true), ) // Output file - .arg(Arg::with_name("file-output") - .short("o") - .long("output-file") - .value_name("PATH") - .help("Path to write results to") - .default_value("./results") - .takes_value(true) + .arg( + Arg::with_name("file-output") + .short("o") + .long("output-file") + .value_name("PATH") + .help("Path to write results to") + .default_value("./results") + .takes_value(true), ) // Namespaces (From) - .arg(Arg::with_name("namespaces-from") - .short("f") - .long("from-namespaces") - .value_name("ns,ns,…") - .help("Namespace(s) of pages from which links may originate") - .default_value("0") - .takes_value(true) - .use_delimiter(true) + .arg( + Arg::with_name("namespaces-from") + .short("f") + .long("from-namespaces") + .value_name("ns,ns,…") + .help("Namespace(s) of pages from which links may originate") + .default_value("0") + .takes_value(true) + .use_delimiter(true), ) // Namespaces (To) - .arg(Arg::with_name("namespaces-to") - .short("t") - .long("to-namespaces") - .value_name("ns,ns,…") - .help("Namespace(s) of pages to which links may lead") - .default_value("0") - .takes_value(true) - .use_delimiter(true) + .arg( + Arg::with_name("namespaces-to") + .short("t") + .long("to-namespaces") + .value_name("ns,ns,…") + .help("Namespace(s) of pages to which links may lead") + .default_value("0") + .takes_value(true) + .use_delimiter(true), ) // Buffer size - .arg(Arg::with_name("buf-size") - .short("b") - .long("bufsize") - .value_name("MiB") - .help("Buffer size per thread") - .default_value("32") - .takes_value(true) - .validator(|t| { - match t.parse::() { + .arg( + Arg::with_name("buf-size") + .short("b") + .long("bufsize") + .value_name("MiB") + .help("Buffer size per thread") + .default_value("32") + .takes_value(true) + .validator(|bs| match bs.parse::() { Err(_) => Err("must be a positive number".to_string()), Ok(value) => { if value > 8 && value < 1024 { @@ -108,24 +114,40 @@ pub fn init_cli_app() -> Result { Err("must be between 8 and 1024".to_string()) } } - } - }) + }), ) // Cutoff threshold - .arg(Arg::with_name("cutoff-threshold") - .short("c") - .long("cutoff") - .value_name("THRESHOLD") - .help("Output only pages with link-count above threshold") 
- .default_value("25000") - .takes_value(true) - .validator(|t| { - t.parse::().map(|_| ()).map_err(|_| "must be a positive number".to_string()) - }) + .arg( + Arg::with_name("cutoff-threshold") + .short("c") + .long("cutoff") + .value_name("THRESHOLD") + .help("Output only pages with link-count above threshold") + .default_value("25000") + .takes_value(true) + .validator(|t| { + t.parse::() + .map(|_| ()) + .map_err(|_| "must be a positive number".to_string()) + }), ) // Export format - // TODO - .get_matches(); + .arg( + Arg::with_name("export-format") + .short("e") + .long("export-as") + .value_name("FORMAT") + .help("Format to output results as") + .long_help("Supported formats are: text (plain), wikitext, markdown (gfm)") + .default_value("text") + .takes_value(true) + .validator(|f| { + ExportFormat::try_from(f.as_str()) + .map(|_| ()) + .map_err(|e| e) + }), + ) + .get_matches(); // Conversion let page_file = PathBuf::from_str(matches.value_of("file-page").unwrap())?; @@ -133,14 +155,23 @@ pub fn init_cli_app() -> Result { let pagelinks_file = PathBuf::from_str(matches.value_of("file-pagelinks").unwrap())?; let output_file = PathBuf::from_str(matches.value_of("file-output").unwrap())?; - let namespaces_from = matches.values_of("namespaces-from").unwrap() - .map(|ns| PageNs(ns.parse::().unwrap()) ).collect::>(); - let namespaces_to = matches.values_of("namespaces-to").unwrap() - .map(|ns| PageNs(ns.parse::().unwrap()) ).collect::>(); + let namespaces_from = matches + .values_of("namespaces-from") + .unwrap() + .map(|ns| PageNs(ns.parse::().unwrap())) + .collect::>(); + let namespaces_to = matches + .values_of("namespaces-to") + .unwrap() + .map(|ns| PageNs(ns.parse::().unwrap())) + .collect::>(); let buf_size_mib = matches.value_of("buf-size").unwrap().parse::()?; - let cutoff_threshold = matches.value_of("cutoff-threshold").unwrap().parse::()?; - // let export_format = TODO + let cutoff_threshold = matches + .value_of("cutoff-threshold") + .unwrap() + .parse::()?; + let export_format = ExportFormat::try_from(matches.value_of("export-format").unwrap()).unwrap(); let cli_params = CliParams { page_file, @@ -151,8 +182,8 @@ pub fn init_cli_app() -> Result { cutoff_threshold, namespaces_from, namespaces_to, - export_format: ExportFormat::PlainText, + export_format, }; Ok(cli_params) -} \ No newline at end of file +} diff --git a/src/link_count.rs b/src/link_count.rs index d87b164..d0b4b77 100644 --- a/src/link_count.rs +++ b/src/link_count.rs @@ -7,7 +7,7 @@ use std::ops::AddAssign; #[derive(Clone, Copy, Debug, Default)] pub struct LinkCount { pub direct: u32, - pub indirect: u32 + pub indirect: u32, } impl LinkCount { @@ -24,7 +24,7 @@ impl AddAssign for LinkCount { fn add_assign(&mut self, other: LinkCount) { *self = Self { direct: self.direct + other.direct, - indirect: self.indirect + other.indirect + indirect: self.indirect + other.indirect, }; } } @@ -47,4 +47,4 @@ impl PartialEq for LinkCount { fn eq(&self, other: &LinkCount) -> bool { (self.direct + self.indirect) == (other.direct + other.indirect) } -} \ No newline at end of file +} diff --git a/src/main.rs b/src/main.rs index 28ad1ee..f243479 100644 --- a/src/main.rs +++ b/src/main.rs @@ -29,8 +29,8 @@ fn main() -> Result<()> { // Process page-table data let pages = { - let f = File::open(&cli.page_file).with_context( - || format!("Failed to open page file ‘{}’", &cli.page_file.display()))?; + let f = File::open(&cli.page_file) + .with_context(|| format!("Failed to open page file ‘{}’", &cli.page_file.display()))?; if 
util::is_probably_gzip(&cli.page_file) { page_table::collect_pages(GzDecoder::new(f), &cli.namespaces_to, buf_size) } else { @@ -40,8 +40,12 @@ fn main() -> Result<()> { // Process redirect-table data let redirects = { - let f = File::open(&cli.redirect_file).with_context( - || format!("Failed to open redirect file ‘{}’", &cli.redirect_file.display()))?; + let f = File::open(&cli.redirect_file).with_context(|| { + format!( + "Failed to open redirect file ‘{}’", + &cli.redirect_file.display() + ) + })?; if util::is_probably_gzip(&cli.redirect_file) { redirect_table::map_redirects(GzDecoder::new(f), pages, &cli.namespaces_to, buf_size) } else { @@ -51,14 +55,26 @@ fn main() -> Result<()> { // Process pagelinks-table data let pagelinks = { - let f = File::open(&cli.pagelinks_file).with_context( - || format!("Failed to open pagelinks file ‘{}’", &cli.pagelinks_file.display()))?; + let f = File::open(&cli.pagelinks_file).with_context(|| { + format!( + "Failed to open pagelinks file ‘{}’", + &cli.pagelinks_file.display() + ) + })?; if util::is_probably_gzip(&cli.pagelinks_file) { - pagelinks_table::count_links(GzDecoder::new(f), redirects, - (&cli.namespaces_from, &cli.namespaces_to), buf_size) + pagelinks_table::count_links( + GzDecoder::new(f), + redirects, + (&cli.namespaces_from, &cli.namespaces_to), + buf_size, + ) } else { - pagelinks_table::count_links(f, redirects, (&cli.namespaces_from, &cli.namespaces_to), - buf_size) + pagelinks_table::count_links( + f, + redirects, + (&cli.namespaces_from, &cli.namespaces_to), + buf_size, + ) } }?; diff --git a/src/page_table.rs b/src/page_table.rs index 070bea0..f142a62 100644 --- a/src/page_table.rs +++ b/src/page_table.rs @@ -5,7 +5,7 @@ use crate::{ buffer_queue::BufferQueue, chunked_reader::ChunkedReader, progress_display::ProgressDisplay, - util::{self, PageId, PageNs, PageTitle} + util::{self, PageId, PageNs, PageTitle}, }; use ahash::AHashMap; @@ -15,8 +15,14 @@ use regex::Regex; use std::io::Read; use std::sync::Mutex; -pub fn collect_pages(source: T, namespaces: &[PageNs], buffer_size: usize) - -> Result> where T: Read + Send { +pub fn collect_pages( + source: T, + namespaces: &[PageNs], + buffer_size: usize, +) -> Result> +where + T: Read + Send, +{ let pages: Mutex> = Mutex::new(AHashMap::new()); let mut source = ChunkedReader::new(source); @@ -30,7 +36,10 @@ pub fn collect_pages(source: T, namespaces: &[PageNs], buffer_size: usize) let regex = ®ex; loop { - eprint!("\r1/5 Extracting ‘page’ table data ({:.1} GiB processed)", progress.next()); + eprint!( + "\r1/5 Extracting ‘page’ table data ({:.1} GiB processed)", + progress.next() + ); let buffer = buffers.pop(); let was_final_read = !source.read_into(&mut buffer.borrow(), buffer_size)?; @@ -82,7 +91,9 @@ for the sake of completeness. Cost seems negligible. 
fn build_page_regex(namespaces: &[PageNs]) -> Result { let ns_str = util::namespaces_to_string(namespaces); - let pattern = format!(r"\((\d+),({}),'((?:[^']|\\'){{1,255}}?)','[a-z,:=]*?',1,", ns_str); + let pattern = format!( + r"\((\d+),({}),'((?:[^']|\\'){{1,255}}?)','[a-z,:=]*?',1,", + ns_str + ); Regex::new(&pattern).context("Building page regex") } - diff --git a/src/pagelinks_table.rs b/src/pagelinks_table.rs index 5d2e8f3..d767690 100644 --- a/src/pagelinks_table.rs +++ b/src/pagelinks_table.rs @@ -6,7 +6,7 @@ use crate::{ chunked_reader::ChunkedReader, link_count::LinkCount, progress_display::ProgressDisplay, - util::{self, PageNs, PageTitle} + util::{self, PageNs, PageTitle}, }; use ahash::AHashMap; @@ -19,12 +19,18 @@ use std::sync::Mutex; #[derive(Debug, Default)] pub struct Counter { pub direct: u32, - pub indirect: u32 + pub indirect: u32, } -pub fn count_links(source: T, redirects: AHashMap<(PageNs, PageTitle), PageTitle>, - namespaces: (&[PageNs], &[PageNs]), buffer_size: usize) - -> Result> where T: Read + Send { +pub fn count_links( + source: T, + redirects: AHashMap<(PageNs, PageTitle), PageTitle>, + namespaces: (&[PageNs], &[PageNs]), + buffer_size: usize, +) -> Result> +where + T: Read + Send, +{ let pagelinks: Mutex> = Mutex::new(AHashMap::new()); let mut source = ChunkedReader::new(source); @@ -39,13 +45,17 @@ pub fn count_links(source: T, redirects: AHashMap<(PageNs, PageTitle), PageTi let regex = ®ex; loop { - eprint!("\r3/5 Extracting ‘pagelinks’ table data and counting links \ - ({:.1} GiB processed)", progress.next()); + eprint!( + "\r3/5 Extracting ‘pagelinks’ table data and counting links \ + ({:.1} GiB processed)", + progress.next() + ); let buffer = buffers.pop(); let was_final_read = !source.read_into(&mut buffer.borrow(), buffer_size)?; s.spawn_fifo(move |_| { let mut new_pagelinks = AHashMap::<(PageNs, PageTitle), LinkCount>::new(); + for cap in regex.captures_iter(&buffer.borrow()) { let ns = PageNs(cap[1].parse::().unwrap()); let title = &cap[2]; @@ -60,8 +70,8 @@ pub fn count_links(source: T, redirects: AHashMap<(PageNs, PageTitle), PageTi if let Some(link_count) = pl_query(&mut new_pagelinks, ns, title) { link_count.direct += 1; } else { - new_pagelinks.insert((ns, PageTitle(title.to_string())), - LinkCount::new(1, 0)); + new_pagelinks + .insert((ns, PageTitle(title.to_string())), LinkCount::new(1, 0)); } } } @@ -105,13 +115,14 @@ but it offers some protection against fauly matches in the case of erroneous dat ,(?:{}), : match the ‘pl_from_namespace’ on any of the numbers passed via the first namespaces function parameter. */ -fn build_pagelinks_regex(namespaces_from: &[PageNs], namespaces_to: &[PageNs]) - -> Result { +fn build_pagelinks_regex(namespaces_from: &[PageNs], namespaces_to: &[PageNs]) -> Result { let ns_from_str = util::namespaces_to_string(namespaces_from); let ns_to_str = util::namespaces_to_string(namespaces_to); - let pattern = format!(r"\(\d+,({}),'((?:[^']|\\'){{1,255}}?)',(?:{})\)", - ns_to_str, ns_from_str); + let pattern = format!( + r"\(\d+,({}),'((?:[^']|\\'){{1,255}}?)',(?:{})\)", + ns_to_str, ns_from_str + ); Regex::new(&pattern).context("Building pagelinks regex") } @@ -121,23 +132,43 @@ the lifetime of a query parameter is required to encompass that of the HashMap. temporary &str from the regex iterator, they would require cloning into String for each query. 
*/ #[inline] -fn rd_query<'a>(rd: &'a AHashMap<(PageNs, PageTitle), PageTitle>, ns: PageNs, title: &str) - -> Option<&'a PageTitle> { - unsafe { // Satisfy the &(PageNs, String) interface without new allocations - let key = (ns, PageTitle(String::from_raw_parts(title.as_ptr() as *mut u8, - title.len(), title.len()))); +fn rd_query<'a>( + rd: &'a AHashMap<(PageNs, PageTitle), PageTitle>, + ns: PageNs, + title: &str, +) -> Option<&'a PageTitle> { + unsafe { + // Satisfy the &(PageNs, String) interface without reallocations + let key = ( + ns, + PageTitle(String::from_raw_parts( + title.as_ptr() as *mut u8, + title.len(), + title.len(), + )), + ); let key = std::mem::ManuallyDrop::new(key); rd.get(&*key) } } #[inline] -fn pl_query<'a>(pl: &'a mut AHashMap<(PageNs, PageTitle), LinkCount>, ns: PageNs, title: &str) --> Option<&'a mut LinkCount> { - unsafe { // As above - let key = (ns, PageTitle(String::from_raw_parts(title.as_ptr() as *mut u8, - title.len(), title.len()))); +fn pl_query<'a>( + pl: &'a mut AHashMap<(PageNs, PageTitle), LinkCount>, + ns: PageNs, + title: &str, +) -> Option<&'a mut LinkCount> { + unsafe { + // As above + let key = ( + ns, + PageTitle(String::from_raw_parts( + title.as_ptr() as *mut u8, + title.len(), + title.len(), + )), + ); let key = std::mem::ManuallyDrop::new(key); pl.get_mut(&*key) } -} \ No newline at end of file +} diff --git a/src/progress_display.rs b/src/progress_display.rs index ea6da1c..f6e91cc 100644 --- a/src/progress_display.rs +++ b/src/progress_display.rs @@ -14,7 +14,7 @@ impl ProgressDisplay { ProgressDisplay { progress_counter: 0, - buffer_size_gib + buffer_size_gib, } } @@ -23,4 +23,4 @@ impl ProgressDisplay { self.progress_counter += 1; out } -} \ No newline at end of file +} diff --git a/src/redirect_table.rs b/src/redirect_table.rs index 708bfba..8e5dd47 100644 --- a/src/redirect_table.rs +++ b/src/redirect_table.rs @@ -5,7 +5,7 @@ use crate::{ buffer_queue::BufferQueue, chunked_reader::ChunkedReader, progress_display::ProgressDisplay, - util::{self, PageId, PageNs, PageTitle} + util::{self, PageId, PageNs, PageTitle}, }; use ahash::AHashMap; @@ -15,9 +15,15 @@ use regex::Regex; use std::io::Read; use std::sync::Mutex; -pub fn map_redirects(source: T, pages: AHashMap<(PageNs, PageId), PageTitle>, - namespaces: &[PageNs], buffer_size: usize) - -> Result> where T: Read + Send { +pub fn map_redirects( + source: T, + pages: AHashMap<(PageNs, PageId), PageTitle>, + namespaces: &[PageNs], + buffer_size: usize, +) -> Result> +where + T: Read + Send, +{ let redirects: Mutex> = Mutex::new(AHashMap::new()); let mut source = ChunkedReader::new(source); @@ -32,21 +38,27 @@ pub fn map_redirects(source: T, pages: AHashMap<(PageNs, PageId), PageTitle>, let regex = ®ex; loop { - eprint!("\r2/5 Extracting ‘redirect’ table data and mapping relations \ - ({:.1} GiB processed)", progress.next()); + eprint!( + "\r2/5 Extracting ‘redirect’ table data and mapping relations \ + ({:.1} GiB processed)", + progress.next() + ); let buffer = buffers.pop(); let was_final_read = !source.read_into(&mut buffer.borrow(), buffer_size)?; s.spawn_fifo(move |_| { let mut new_redirects: Vec<((PageNs, PageTitle), PageTitle)> = Vec::new(); + for cap in regex.captures_iter(&buffer.borrow()) { let source_id = PageId(cap[1].parse::().unwrap()); let source_ns = PageNs(cap[2].parse::().unwrap()); let target_title = &cap[3]; if let Some(source_title) = pages.get(&(source_ns, source_id)) { - new_redirects.push(((source_ns, source_title.clone()), - PageTitle(target_title.to_string()))); + 
new_redirects.push(( + (source_ns, source_title.clone()), + PageTitle(target_title.to_string()), + )); } } @@ -88,7 +100,9 @@ to be empty. fn build_redirect_regex(namespaces: &[PageNs]) -> Result { let ns_str = util::namespaces_to_string(namespaces); - let pattern = format!(r"\((\d+),({}),'((?:[^']|\\'){{1,255}}?)','','(?:[^']|\\'){{0,255}}?'\)", - ns_str); + let pattern = format!( + r"\((\d+),({}),'((?:[^']|\\'){{1,255}}?)','','(?:[^']|\\'){{0,255}}?'\)", + ns_str + ); Regex::new(&pattern).context("Building redirect pattern") } diff --git a/src/util.rs b/src/util.rs index 0c59e96..1bb9af7 100644 --- a/src/util.rs +++ b/src/util.rs @@ -6,6 +6,7 @@ use crate::link_count::LinkCount; use ahash::AHashMap; use anyhow::Result; +use std::convert::TryFrom; use std::fmt; use std::fs::File; use std::io::Write; @@ -34,7 +35,22 @@ impl fmt::Display for PageTitle { #[derive(Clone, Copy)] pub enum ExportFormat { - PlainText + PlainText, + WikiText, + Markdown +} + +impl TryFrom<&str> for ExportFormat { + type Error = String; + + fn try_from(format: &str) -> Result { + match format { + "text" => Ok(Self::PlainText), + "wiki" => Ok(Self::WikiText), + "markdown" => Ok(Self::Markdown), + _ => Err(format!("Cannot convert ‘{}’ into ExportFormat", format)), + } + } } pub fn is_probably_gzip(path: &Path) -> bool { @@ -50,12 +66,21 @@ pub fn build_output_filename(path: &Path, export_format: ExportFormat) -> PathBu use ExportFormat::*; let mut filename = path.to_path_buf(); - if path.extension().is_some() { // User supplied extension - return filename + if path.extension().is_some() { + // User supplied extension + return filename; } match export_format { - PlainText => { filename.set_extension("txt"); } + PlainText => { + filename.set_extension("txt"); + }, + WikiText => { + filename.set_extension("txt"); + }, + Markdown => { + filename.set_extension("md"); + }, } filename @@ -93,9 +118,12 @@ fn underscores_to_spaces(mut s: String) -> String { s } -pub fn sort_pagelinks(pagelinks: AHashMap<(PageNs, PageTitle), LinkCount>, cutoff: u32) - -> Vec<((PageNs, PageTitle), LinkCount)> { - let mut output: Vec<((PageNs, PageTitle), LinkCount)> = pagelinks.into_iter() +pub fn sort_pagelinks( + pagelinks: AHashMap<(PageNs, PageTitle), LinkCount>, + cutoff: u32, +) -> Vec<((PageNs, PageTitle), LinkCount)> { + let mut output: Vec<((PageNs, PageTitle), LinkCount)> = pagelinks + .into_iter() .filter(|pl| pl.1.total() >= cutoff) .collect(); @@ -103,23 +131,88 @@ pub fn sort_pagelinks(pagelinks: AHashMap<(PageNs, PageTitle), LinkCount>, cutof output } -pub fn export_to_file(pages: Vec<((PageNs, PageTitle), LinkCount)>, mut file: File, - format: ExportFormat) -> Result<()> { +pub fn export_to_file( + pages: Vec<((PageNs, PageTitle), LinkCount)>, + mut file: File, + format: ExportFormat, +) -> Result<()> { use ExportFormat::*; match format { PlainText => write_plaintext(&mut file, pages)?, + WikiText => write_wikitext(&mut file, pages)?, + Markdown => write_markdown(&mut file, pages)?, } Ok(()) } fn write_plaintext(file: &mut File, pages: Vec<((PageNs, PageTitle), LinkCount)>) -> Result<()> { - writeln!(file, "page title [namespace] → links-total (direct + indirect)\n")?; + writeln!( + file, + "page title [namespace] → links-total (direct + indirect)\n" + )?; for p in pages { let title = underscores_to_spaces(((p.0).1).0); - writeln!(file, "{} [{}] → {} ({} + {})", - title, (p.0).0, p.1.total(), p.1.direct, p.1.indirect)?; + writeln!( + file, + "{} [{}] → {} ({} + {})", + title, + (p.0).0, + p.1.total(), + p.1.direct, + 
p.1.indirect + )?; } Ok(()) -} \ No newline at end of file +} + +fn write_wikitext(file: &mut File, pages: Vec<((PageNs, PageTitle), LinkCount)>) -> Result<()> { + writeln!(file, "{{|class=\"wikitable sortable\"")?; + writeln!( + file, + "! Page !! Ns !! Links total !! Direct !! via redirect\n|-" + )?; + + for p in pages { + let title = underscores_to_spaces(((p.0).1).0); + writeln!( + file, + "| [[{}]] || {} || {} || {} || {}\n|-", + title, + (p.0).0, + p.1.total(), + p.1.direct, + p.1.indirect + )?; + } + + writeln!(file, "|}}")?; + Ok(()) +} + +fn write_markdown(file: &mut File, pages: Vec<((PageNs, PageTitle), LinkCount)>) -> Result<()> { + // NOTE: Markdown tables are non-standard (GitHub Flavored Markdown); This function also + // doesn't pretty-print the table, which would require significantly more logic. + writeln!( + file, + "Page | Ns | Links total | Direct | via redirect\n\ + :--- | :---: | ---: | ---: | ---:" + )?; + + for p in pages { + let title = underscores_to_spaces(((p.0).1).0); + writeln!( + file, + "{} | {} | {} | {} | {}", + title, + (p.0).0, + p.1.total(), + p.1.direct, + p.1.indirect + )?; + } + + Ok(()) +} +
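Usage note for the export path added in `util.rs`: the sketch below is not part of the patch (the `demo_export` helper and the page data in it are invented for illustration), but it only uses items that appear above, namely `PageNs`, `PageTitle`, `LinkCount::new`, `ExportFormat` and `export_to_file`, and it assumes it runs inside the crate.

```
use crate::link_count::LinkCount;
use crate::util::{export_to_file, ExportFormat, PageNs, PageTitle};
use std::fs::File;

// Hypothetical helper: write a single invented result row with the new WikiText exporter.
fn demo_export() -> anyhow::Result<()> {
    let pages = vec![(
        (PageNs(0), PageTitle("Example_page".to_string())),
        LinkCount::new(900, 100), // 900 direct links + 100 via redirects
    )];
    let file = File::create("results.txt")?;
    export_to_file(pages, file, ExportFormat::WikiText)
}
```

Going by the format strings in `write_wikitext` and `write_markdown`, the output file would then hold a sortable WikiText table (or, with `ExportFormat::Markdown`, a GFM table) along these lines, row values again invented:

```
{|class="wikitable sortable"
! Page !! Ns !! Links total !! Direct !! via redirect
|-
| [[Example page]] || 0 || 1000 || 900 || 100
|-
|}
```

```
Page | Ns | Links total | Direct | via redirect
:--- | :---: | ---: | ---: | ---:
Example page | 0 | 1000 | 900 | 100
```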