Add WikiText and Markdown export formats; Apply rustfmt to entire source
adrian5 committed Jun 23, 2020
1 parent be8ff76 commit 3999c68
Showing 12 changed files with 368 additions and 165 deletions.
14 changes: 10 additions & 4 deletions README.md
@@ -28,9 +28,9 @@ The program requires three input files to operate:

For the English Wikipedia, you can get these at <https://dumps.wikimedia.org/enwiki/> as:

* enwiki-yyyy-mm-dd-page.sql.gz
* enwiki-yyyy-mm-dd-redirect.sql.gz
* enwiki-yyyy-mm-dd-pagelinks.sql.gz
* enwiki-yyyymmdd-page.sql.gz
* enwiki-yyyymmdd-redirect.sql.gz
* enwiki-yyyymmdd-pagelinks.sql.gz

### Hardware

@@ -62,9 +62,15 @@ Custom (128 MiB) buffer size and a link-count cutoff of 185K, below which pages
wikidigest-link-count -p page.sql.gz -r redirect.sql.gz -l pagelinks.sql.gz -o /tmp/185k-or-more -b 128 -c 185000
```

Export as a different format ([WikiText](https://en.wikipedia.org/wiki/Help:Wikitext) table):

```
wikidigest-link-count -p page.sql -r redirect.sql -l pagelinks.sql -e wikitext
```
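
The Markdown (GFM) table export added in this commit is selected the same way; an illustrative invocation, using the `markdown` value listed in the new `--export-as` option's help text (see the cli.rs diff below):

```
wikidigest-link-count -p page.sql -r redirect.sql -l pagelinks.sql -e markdown
```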

## Results

Results are written to a plain text file.
Results are written to an output file, by default as plain text to `./results.txt`.

Below are results for the English Wikipedia, Apr 2020 – pages with 200K or more incoming links
within the main (0) namespace:
2 changes: 0 additions & 2 deletions notes.md
@@ -24,6 +24,4 @@
- XML
- JSON
- HTML (table)
- Markdown (table)
- Wiki (table)

8 changes: 4 additions & 4 deletions src/buffer_queue.rs
@@ -10,7 +10,7 @@ const QUERY_INTERVAL_MS: u64 = 250;

pub struct BufferQueue {
buffers: Vec<Mutex<String>>,
queue: Mutex<VecDeque<usize>>
queue: Mutex<VecDeque<usize>>,
}

impl BufferQueue {
@@ -25,7 +25,7 @@ impl BufferQueue {

Self {
buffers,
queue: Mutex::new(queue)
queue: Mutex::new(queue),
}
}

@@ -34,7 +34,7 @@ impl BufferQueue {
Buffer {
id,
inner: &self.buffers[id],
queue: &self.queue
queue: &self.queue,
}
}

@@ -57,7 +57,7 @@ Once released, it pushes its ID back into the shared queue, allowing the resourc
pub struct Buffer<'a> {
id: usize,
inner: &'a Mutex<String>,
queue: &'a Mutex<VecDeque<usize>>
queue: &'a Mutex<VecDeque<usize>>,
}

impl<'a> Buffer<'a> {
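
For context on the `Buffer` / `BufferQueue` pair reformatted above, here is a minimal sketch of the acquire-and-release pattern the doc comment describes. The field layout mirrors the diff, but the `try_acquire` method and the `Drop` impl are assumptions, not the crate's actual code (the `QUERY_INTERVAL_MS` constant in the diff suggests the real implementation polls until a buffer frees up):

```
use std::collections::VecDeque;
use std::sync::Mutex;

struct BufferQueue {
    buffers: Vec<Mutex<String>>,
    queue: Mutex<VecDeque<usize>>, // IDs of buffers that are currently free
}

struct Buffer<'a> {
    id: usize,
    inner: &'a Mutex<String>,
    queue: &'a Mutex<VecDeque<usize>>,
}

impl BufferQueue {
    // Hand out the next free buffer, if one is available (assumed behavior).
    fn try_acquire(&self) -> Option<Buffer<'_>> {
        let id = self.queue.lock().unwrap().pop_front()?;
        Some(Buffer {
            id,
            inner: &self.buffers[id],
            queue: &self.queue,
        })
    }
}

impl<'a> Drop for Buffer<'a> {
    // On release, push the ID back into the shared queue so the
    // underlying buffer can be handed out again.
    fn drop(&mut self) {
        self.queue.lock().unwrap().push_back(self.id);
    }
}

fn main() {
    let pool = BufferQueue {
        buffers: (0..2).map(|_| Mutex::new(String::new())).collect(),
        queue: Mutex::new((0..2).collect()),
    };
    {
        let buf = pool.try_acquire().expect("a buffer is free");
        buf.inner.lock().unwrap().push_str("INSERT INTO …");
    } // Dropped here: its ID returns to the queue
    assert!(pool.try_acquire().is_some());
}
```
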
17 changes: 10 additions & 7 deletions src/chunked_reader.rs
@@ -4,18 +4,18 @@ the door to parallel processing.
*/
use std::io::{Read, Result};

pub struct ChunkedReader<T> {
pub struct ChunkedReader<T: Read> {
source: T,
remainder: Vec<u8>,
exhausted: bool
exhausted: bool,
}

impl<T: Read> ChunkedReader<T> {
pub fn new(source: T) -> Self {
Self {
source,
remainder: Vec::new(),
exhausted: false
exhausted: false,
}
}

@@ -44,7 +44,8 @@ impl<T: Read> ChunkedReader<T> {
dest.set_len(target_size); // Restore to full target size
while bytes_read_total < target_size {
let bytes_read = self.source.read(&mut dest[bytes_read_total..])?;
if bytes_read == 0 { // Assume final read
if bytes_read == 0 {
// Assume final read
dest.truncate(bytes_read_total);
self.exhausted = true;
return Ok(false); // Signal final read
@@ -88,7 +89,8 @@ impl<T: Read> ChunkedReader<T> {
chunk.set_len(target_size); // Restore to full target size
while bytes_read_total < target_size {
let bytes_read = self.source.read(&mut chunk[bytes_read_total..])?;
if bytes_read == 0 { // Assume final read
if bytes_read == 0 {
// Assume final read
chunk.truncate(bytes_read_total);
self.exhausted = true;
break;
@@ -101,11 +103,12 @@ impl<T: Read> ChunkedReader<T> {
// Split at last newline, keep right-hand side for next chunk
self.remainder.clear();
if let Some(cutoff) = chunk.rfind('\n') {
self.remainder.extend_from_slice(&chunk.as_bytes()[cutoff..]);
self.remainder
.extend_from_slice(&chunk.as_bytes()[cutoff..]);
chunk.truncate(cutoff);
}
}

Ok(Some(chunk))
}
}
}
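
The hunk above contains the reader's central idea: cut each chunk at its last newline and carry the tail into the next read, so no line is split across chunk boundaries. A self-contained sketch of that step, using a hypothetical free function that mirrors the logic in the diff (not the crate's actual API):

```
// Split `chunk` at its last newline: the left part is ready for parsing,
// while the tail (starting at the newline) is carried into the next chunk.
fn split_off_remainder(chunk: &mut String, remainder: &mut Vec<u8>) {
    remainder.clear();
    if let Some(cutoff) = chunk.rfind('\n') {
        remainder.extend_from_slice(&chunk.as_bytes()[cutoff..]);
        chunk.truncate(cutoff);
    }
}

fn main() {
    let mut chunk = String::from("INSERT INTO page VALUES (1);\nINSERT INTO pa");
    let mut remainder = Vec::new();

    split_off_remainder(&mut chunk, &mut remainder);

    assert_eq!(chunk, "INSERT INTO page VALUES (1);");
    assert_eq!(remainder, b"\nINSERT INTO pa".to_vec());
}
```
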
191 changes: 111 additions & 80 deletions src/cli.rs
@@ -4,8 +4,9 @@ Parsing CLI arguments
use crate::util::{ExportFormat, PageNs};

use anyhow::Result;
use clap::{Arg, App};
use clap::{App, Arg};

use std::convert::TryFrom;
use std::path::PathBuf;
use std::str::FromStr;

@@ -18,12 +19,11 @@ pub struct CliParams {
pub namespaces_to: Vec<PageNs>,
pub buf_size_mib: usize,
pub cutoff_threshold: u32,
pub export_format: ExportFormat
pub export_format: ExportFormat,
}

pub fn init_cli_app() -> Result<CliParams> {
let matches =
App::new("wikidigest-link-count")
let matches = App::new("wikidigest-link-count")
.version("0.1")
.author("github.com/adrian5")
.about("Find the most linked-to pages in MediaWiki databases")
@@ -32,74 +32,80 @@ pub fn init_cli_app() -> Result<CliParams> {
1. The page-table SQL dump (…page.sql.gz)\n\
2. The redirect-table SQL dump (…redirect.sql.gz)\n\
3. The pagelinks-table SQL dump (…pagelinks.sql.gz)\n\n\
For the English Wikipedia, you can get these at https://dumps.wikimedia.org/enwiki/"
For the English Wikipedia, you can get these at https://dumps.wikimedia.org/enwiki/",
)
// Page file
.arg(Arg::with_name("file-page")
.short("p")
.long("page-file")
.value_name("PATH")
.help("Path to ‘…page.sql(.gz)’")
.takes_value(true)
.required(true)
.arg(
Arg::with_name("file-page")
.short("p")
.long("page-file")
.value_name("PATH")
.help("Path to ‘…page.sql(.gz)’")
.takes_value(true)
.required(true),
)
// Redirect file
.arg(Arg::with_name("file-redirect")
.short("r")
.long("redirect-file")
.value_name("PATH")
.help("Path to ‘…redirect.sql(.gz)’")
.takes_value(true)
.required(true)
.arg(
Arg::with_name("file-redirect")
.short("r")
.long("redirect-file")
.value_name("PATH")
.help("Path to ‘…redirect.sql(.gz)’")
.takes_value(true)
.required(true),
)
// Pagelinks file
.arg(Arg::with_name("file-pagelinks")
.short("l")
.long("pagelinks-file")
.value_name("PATH")
.help("Path to ‘…pagelinks.sql(.gz)’")
.takes_value(true)
.required(true)
.arg(
Arg::with_name("file-pagelinks")
.short("l")
.long("pagelinks-file")
.value_name("PATH")
.help("Path to ‘…pagelinks.sql(.gz)’")
.takes_value(true)
.required(true),
)
// Output file
.arg(Arg::with_name("file-output")
.short("o")
.long("output-file")
.value_name("PATH")
.help("Path to write results to")
.default_value("./results")
.takes_value(true)
.arg(
Arg::with_name("file-output")
.short("o")
.long("output-file")
.value_name("PATH")
.help("Path to write results to")
.default_value("./results")
.takes_value(true),
)
// Namespaces (From)
.arg(Arg::with_name("namespaces-from")
.short("f")
.long("from-namespaces")
.value_name("ns,ns,…")
.help("Namespace(s) of pages from which links may originate")
.default_value("0")
.takes_value(true)
.use_delimiter(true)
.arg(
Arg::with_name("namespaces-from")
.short("f")
.long("from-namespaces")
.value_name("ns,ns,…")
.help("Namespace(s) of pages from which links may originate")
.default_value("0")
.takes_value(true)
.use_delimiter(true),
)
// Namespaces (To)
.arg(Arg::with_name("namespaces-to")
.short("t")
.long("to-namespaces")
.value_name("ns,ns,…")
.help("Namespace(s) of pages to which links may lead")
.default_value("0")
.takes_value(true)
.use_delimiter(true)
.arg(
Arg::with_name("namespaces-to")
.short("t")
.long("to-namespaces")
.value_name("ns,ns,…")
.help("Namespace(s) of pages to which links may lead")
.default_value("0")
.takes_value(true)
.use_delimiter(true),
)
// Buffer size
.arg(Arg::with_name("buf-size")
.short("b")
.long("bufsize")
.value_name("MiB")
.help("Buffer size per thread")
.default_value("32")
.takes_value(true)
.validator(|t| {
match t.parse::<u32>() {
.arg(
Arg::with_name("buf-size")
.short("b")
.long("bufsize")
.value_name("MiB")
.help("Buffer size per thread")
.default_value("32")
.takes_value(true)
.validator(|bs| match bs.parse::<u32>() {
Err(_) => Err("must be a positive number".to_string()),
Ok(value) => {
if value > 8 && value < 1024 {
@@ -108,39 +114,64 @@
Err("must be between 8 and 1024".to_string())
}
}
}
})
}),
)
// Cutoff threshold
.arg(Arg::with_name("cutoff-threshold")
.short("c")
.long("cutoff")
.value_name("THRESHOLD")
.help("Output only pages with link-count above threshold")
.default_value("25000")
.takes_value(true)
.validator(|t| {
t.parse::<u32>().map(|_| ()).map_err(|_| "must be a positive number".to_string())
})
.arg(
Arg::with_name("cutoff-threshold")
.short("c")
.long("cutoff")
.value_name("THRESHOLD")
.help("Output only pages with link-count above threshold")
.default_value("25000")
.takes_value(true)
.validator(|t| {
t.parse::<u32>()
.map(|_| ())
.map_err(|_| "must be a positive number".to_string())
}),
)
// Export format
// TODO
.get_matches();
.arg(
Arg::with_name("export-format")
.short("e")
.long("export-as")
.value_name("FORMAT")
.help("Format to output results as")
.long_help("Supported formats are: text (plain), wikitext, markdown (gfm)")
.default_value("text")
.takes_value(true)
.validator(|f| {
ExportFormat::try_from(f.as_str())
.map(|_| ())
.map_err(|e| e)
}),
)
.get_matches();

// Conversion
let page_file = PathBuf::from_str(matches.value_of("file-page").unwrap())?;
let redirect_file = PathBuf::from_str(matches.value_of("file-redirect").unwrap())?;
let pagelinks_file = PathBuf::from_str(matches.value_of("file-pagelinks").unwrap())?;
let output_file = PathBuf::from_str(matches.value_of("file-output").unwrap())?;

let namespaces_from = matches.values_of("namespaces-from").unwrap()
.map(|ns| PageNs(ns.parse::<u32>().unwrap()) ).collect::<Vec<PageNs>>();
let namespaces_to = matches.values_of("namespaces-to").unwrap()
.map(|ns| PageNs(ns.parse::<u32>().unwrap()) ).collect::<Vec<PageNs>>();
let namespaces_from = matches
.values_of("namespaces-from")
.unwrap()
.map(|ns| PageNs(ns.parse::<u32>().unwrap()))
.collect::<Vec<PageNs>>();
let namespaces_to = matches
.values_of("namespaces-to")
.unwrap()
.map(|ns| PageNs(ns.parse::<u32>().unwrap()))
.collect::<Vec<PageNs>>();

let buf_size_mib = matches.value_of("buf-size").unwrap().parse::<usize>()?;
let cutoff_threshold = matches.value_of("cutoff-threshold").unwrap().parse::<u32>()?;
// let export_format = TODO
let cutoff_threshold = matches
.value_of("cutoff-threshold")
.unwrap()
.parse::<u32>()?;
let export_format = ExportFormat::try_from(matches.value_of("export-format").unwrap()).unwrap();

let cli_params = CliParams {
page_file,
@@ -151,8 +182,8 @@
cutoff_threshold,
namespaces_from,
namespaces_to,
export_format: ExportFormat::PlainText,
export_format,
};

Ok(cli_params)
}
}
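
The new `--export-as` option relies on `ExportFormat::try_from(&str)` from `src/util.rs`, which is not shown in this diff. A plausible shape for that conversion, inferred from the values named in the long help and from the `String` errors the clap validator expects — a sketch, not the actual `util.rs` code:

```
use std::convert::TryFrom;

#[derive(Clone, Copy, Debug)]
pub enum ExportFormat {
    PlainText, // appears verbatim in the old cli.rs line this commit replaces
    WikiText,  // variant name assumed
    Markdown,  // variant name assumed
}

impl TryFrom<&str> for ExportFormat {
    type Error = String;

    // Map a CLI value onto a format, mirroring the values named in the
    // `--export-as` long help: "text", "wikitext", "markdown".
    fn try_from(s: &str) -> Result<Self, Self::Error> {
        match s {
            "text" => Ok(ExportFormat::PlainText),
            "wikitext" => Ok(ExportFormat::WikiText),
            "markdown" => Ok(ExportFormat::Markdown),
            other => Err(format!("unsupported export format ‘{}’", other)),
        }
    }
}

fn main() {
    assert!(ExportFormat::try_from("wikitext").is_ok());
    assert!(ExportFormat::try_from("csv").is_err());
}
```

Only `ExportFormat::PlainText` is confirmed by the diff itself; the other variant names above are guesses for illustration.
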