diff --git a/README.md b/README.md
index 3c421a7..1dca7bc 100644
--- a/README.md
+++ b/README.md
@@ -28,9 +28,9 @@ The program requires three input files to operate:
For the English Wikipedia, you can get these at https://dumps.wikimedia.org/enwiki/ as:
-* enwiki-yyyy-mm-dd-page.sql.gz
-* enwiki-yyyy-mm-dd-redirect.sql.gz
-* enwiki-yyyy-mm-dd-pagelinks.sql.gz
+* enwiki-yyyymmdd-page.sql.gz
+* enwiki-yyyymmdd-redirect.sql.gz
+* enwiki-yyyymmdd-pagelinks.sql.gz
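+
+For example, for a dump dated 2020-04-01 (an illustrative date), the files would be named:
+
+```
+enwiki-20200401-page.sql.gz
+enwiki-20200401-redirect.sql.gz
+enwiki-20200401-pagelinks.sql.gz
+```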
### Hardware
@@ -62,9 +62,15 @@ Custom (128 MiB) buffer size and a link-count cutoff of 185K, below which pages
wikidigest-link-count -p page.sql.gz -r redirect.sql.gz -l pagelinks.sql.gz -o /tmp/185k-or-more -b 128 -c 185000
```
+Export as a different format ([WikiText](https://en.wikipedia.org/wiki/Help:Wikitext) table):
+
+```
+wikidigest-link-count -p page.sql -r redirect.sql -l pagelinks.sql -e wikitext
+```
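+
+The WikiText export writes a sortable wiki table; a sketch of the resulting output, with an illustrative page name and counts:
+
+```
+{|class="wikitable sortable"
+! Page !! Ns !! Links total !! Direct !! via redirect
+|-
+| [[Example page]] || 0 || 250000 || 240000 || 10000
+|-
+|}
+```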
+
## Results
-Results are written to a plain text file.
+Results are written to an output file, by default as plain text to `./results.txt`.
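+
+With the default plain-text format, the file starts with the header line `page title [namespace] → links-total (direct + indirect)`, followed by one line per page, for example (illustrative values):
+
+```
+Example page [0] → 250000 (240000 + 10000)
+```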
Below are results for the English Wikipedia, Apr 2020 – pages with 200K or more incoming links
within the main (0) namespace:
diff --git a/notes.md b/notes.md
index ad91145..0f8788f 100644
--- a/notes.md
+++ b/notes.md
@@ -24,6 +24,4 @@
- XML
- JSON
- HTML (table)
- - Markdown (table)
- - Wiki (table)
diff --git a/src/buffer_queue.rs b/src/buffer_queue.rs
index 2d62dfe..78a0134 100644
--- a/src/buffer_queue.rs
+++ b/src/buffer_queue.rs
@@ -10,7 +10,7 @@ const QUERY_INTERVAL_MS: u64 = 250;
pub struct BufferQueue {
buffers: Vec>,
- queue: Mutex>
+ queue: Mutex>,
}
impl BufferQueue {
@@ -25,7 +25,7 @@ impl BufferQueue {
Self {
buffers,
- queue: Mutex::new(queue)
+ queue: Mutex::new(queue),
}
}
@@ -34,7 +34,7 @@ impl BufferQueue {
Buffer {
id,
inner: &self.buffers[id],
- queue: &self.queue
+ queue: &self.queue,
}
}
@@ -57,7 +57,7 @@ Once released, it pushes its ID back into the shared queue, allowing the resourc
pub struct Buffer<'a> {
id: usize,
inner: &'a Mutex,
- queue: &'a Mutex>
+ queue: &'a Mutex>,
}
impl<'a> Buffer<'a> {
diff --git a/src/chunked_reader.rs b/src/chunked_reader.rs
index 879f917..2c1c220 100644
--- a/src/chunked_reader.rs
+++ b/src/chunked_reader.rs
@@ -4,10 +4,10 @@ the door to parallel processing.
*/
use std::io::{Read, Result};
-pub struct ChunkedReader {
+pub struct ChunkedReader {
source: T,
    remainder: Vec<u8>,
- exhausted: bool
+ exhausted: bool,
}
impl ChunkedReader {
@@ -15,7 +15,7 @@ impl ChunkedReader {
Self {
source,
remainder: Vec::new(),
- exhausted: false
+ exhausted: false,
}
}
@@ -44,7 +44,8 @@ impl ChunkedReader {
dest.set_len(target_size); // Restore to full target size
while bytes_read_total < target_size {
let bytes_read = self.source.read(&mut dest[bytes_read_total..])?;
- if bytes_read == 0 { // Assume final read
+ if bytes_read == 0 {
+ // Assume final read
dest.truncate(bytes_read_total);
self.exhausted = true;
return Ok(false); // Signal final read
@@ -88,7 +89,8 @@ impl ChunkedReader {
chunk.set_len(target_size); // Restore to full target size
while bytes_read_total < target_size {
let bytes_read = self.source.read(&mut chunk[bytes_read_total..])?;
- if bytes_read == 0 { // Assume final read
+ if bytes_read == 0 {
+ // Assume final read
chunk.truncate(bytes_read_total);
self.exhausted = true;
break;
@@ -101,11 +103,12 @@ impl ChunkedReader {
// Split at last newline, keep right-hand side for next chunk
self.remainder.clear();
if let Some(cutoff) = chunk.rfind('\n') {
- self.remainder.extend_from_slice(&chunk.as_bytes()[cutoff..]);
+ self.remainder
+ .extend_from_slice(&chunk.as_bytes()[cutoff..]);
chunk.truncate(cutoff);
}
}
Ok(Some(chunk))
}
-}
\ No newline at end of file
+}
diff --git a/src/cli.rs b/src/cli.rs
index c017873..892142f 100644
--- a/src/cli.rs
+++ b/src/cli.rs
@@ -4,8 +4,9 @@ Parsing CLI arguments
use crate::util::{ExportFormat, PageNs};
use anyhow::Result;
-use clap::{Arg, App};
+use clap::{App, Arg};
+use std::convert::TryFrom;
use std::path::PathBuf;
use std::str::FromStr;
@@ -18,12 +19,11 @@ pub struct CliParams {
    pub namespaces_to: Vec<PageNs>,
pub buf_size_mib: usize,
pub cutoff_threshold: u32,
- pub export_format: ExportFormat
+ pub export_format: ExportFormat,
}
pub fn init_cli_app() -> Result<CliParams> {
- let matches =
- App::new("wikidigest-link-count")
+ let matches = App::new("wikidigest-link-count")
.version("0.1")
.author("github.com/adrian5")
.about("Find the most linked-to pages in MediaWiki databases")
@@ -32,74 +32,80 @@ pub fn init_cli_app() -> Result {
1. The page-table SQL dump (…page.sql.gz)\n\
2. The redirect-table SQL dump (…redirect.sql.gz)\n\
3. The pagelinks-table SQL dump (…pagelinks.sql.gz)\n\n\
- For the English Wikipedia, you can get these at https://dumps.wikimedia.org/enwiki/"
+ For the English Wikipedia, you can get these at https://dumps.wikimedia.org/enwiki/",
)
// Page file
- .arg(Arg::with_name("file-page")
- .short("p")
- .long("page-file")
- .value_name("PATH")
- .help("Path to ‘…page.sql(.gz)’")
- .takes_value(true)
- .required(true)
+ .arg(
+ Arg::with_name("file-page")
+ .short("p")
+ .long("page-file")
+ .value_name("PATH")
+ .help("Path to ‘…page.sql(.gz)’")
+ .takes_value(true)
+ .required(true),
)
// Redirect file
- .arg(Arg::with_name("file-redirect")
- .short("r")
- .long("redirect-file")
- .value_name("PATH")
- .help("Path to ‘…redirect.sql(.gz)’")
- .takes_value(true)
- .required(true)
+ .arg(
+ Arg::with_name("file-redirect")
+ .short("r")
+ .long("redirect-file")
+ .value_name("PATH")
+ .help("Path to ‘…redirect.sql(.gz)’")
+ .takes_value(true)
+ .required(true),
)
// Pagelinks file
- .arg(Arg::with_name("file-pagelinks")
- .short("l")
- .long("pagelinks-file")
- .value_name("PATH")
- .help("Path to ‘…pagelinks.sql(.gz)’")
- .takes_value(true)
- .required(true)
+ .arg(
+ Arg::with_name("file-pagelinks")
+ .short("l")
+ .long("pagelinks-file")
+ .value_name("PATH")
+ .help("Path to ‘…pagelinks.sql(.gz)’")
+ .takes_value(true)
+ .required(true),
)
// Output file
- .arg(Arg::with_name("file-output")
- .short("o")
- .long("output-file")
- .value_name("PATH")
- .help("Path to write results to")
- .default_value("./results")
- .takes_value(true)
+ .arg(
+ Arg::with_name("file-output")
+ .short("o")
+ .long("output-file")
+ .value_name("PATH")
+ .help("Path to write results to")
+ .default_value("./results")
+ .takes_value(true),
)
// Namespaces (From)
- .arg(Arg::with_name("namespaces-from")
- .short("f")
- .long("from-namespaces")
- .value_name("ns,ns,…")
- .help("Namespace(s) of pages from which links may originate")
- .default_value("0")
- .takes_value(true)
- .use_delimiter(true)
+ .arg(
+ Arg::with_name("namespaces-from")
+ .short("f")
+ .long("from-namespaces")
+ .value_name("ns,ns,…")
+ .help("Namespace(s) of pages from which links may originate")
+ .default_value("0")
+ .takes_value(true)
+ .use_delimiter(true),
)
// Namespaces (To)
- .arg(Arg::with_name("namespaces-to")
- .short("t")
- .long("to-namespaces")
- .value_name("ns,ns,…")
- .help("Namespace(s) of pages to which links may lead")
- .default_value("0")
- .takes_value(true)
- .use_delimiter(true)
+ .arg(
+ Arg::with_name("namespaces-to")
+ .short("t")
+ .long("to-namespaces")
+ .value_name("ns,ns,…")
+ .help("Namespace(s) of pages to which links may lead")
+ .default_value("0")
+ .takes_value(true)
+ .use_delimiter(true),
)
// Buffer size
- .arg(Arg::with_name("buf-size")
- .short("b")
- .long("bufsize")
- .value_name("MiB")
- .help("Buffer size per thread")
- .default_value("32")
- .takes_value(true)
- .validator(|t| {
- match t.parse::() {
+ .arg(
+ Arg::with_name("buf-size")
+ .short("b")
+ .long("bufsize")
+ .value_name("MiB")
+ .help("Buffer size per thread")
+ .default_value("32")
+ .takes_value(true)
+ .validator(|bs| match bs.parse::() {
Err(_) => Err("must be a positive number".to_string()),
Ok(value) => {
if value > 8 && value < 1024 {
@@ -108,24 +114,40 @@ pub fn init_cli_app() -> Result {
Err("must be between 8 and 1024".to_string())
}
}
- }
- })
+ }),
)
// Cutoff threshold
- .arg(Arg::with_name("cutoff-threshold")
- .short("c")
- .long("cutoff")
- .value_name("THRESHOLD")
- .help("Output only pages with link-count above threshold")
- .default_value("25000")
- .takes_value(true)
- .validator(|t| {
- t.parse::().map(|_| ()).map_err(|_| "must be a positive number".to_string())
- })
+ .arg(
+ Arg::with_name("cutoff-threshold")
+ .short("c")
+ .long("cutoff")
+ .value_name("THRESHOLD")
+ .help("Output only pages with link-count above threshold")
+ .default_value("25000")
+ .takes_value(true)
+ .validator(|t| {
+ t.parse::()
+ .map(|_| ())
+ .map_err(|_| "must be a positive number".to_string())
+ }),
)
// Export format
- // TODO
- .get_matches();
+ .arg(
+ Arg::with_name("export-format")
+ .short("e")
+ .long("export-as")
+ .value_name("FORMAT")
+ .help("Format to output results as")
+ .long_help("Supported formats are: text (plain), wikitext, markdown (gfm)")
+ .default_value("text")
+ .takes_value(true)
+                .validator(|f| ExportFormat::try_from(f.as_str()).map(|_| ())),
+ )
+ .get_matches();
// Conversion
let page_file = PathBuf::from_str(matches.value_of("file-page").unwrap())?;
@@ -133,14 +155,23 @@ pub fn init_cli_app() -> Result {
let pagelinks_file = PathBuf::from_str(matches.value_of("file-pagelinks").unwrap())?;
let output_file = PathBuf::from_str(matches.value_of("file-output").unwrap())?;
- let namespaces_from = matches.values_of("namespaces-from").unwrap()
- .map(|ns| PageNs(ns.parse::().unwrap()) ).collect::>();
- let namespaces_to = matches.values_of("namespaces-to").unwrap()
- .map(|ns| PageNs(ns.parse::().unwrap()) ).collect::>();
+ let namespaces_from = matches
+ .values_of("namespaces-from")
+ .unwrap()
+ .map(|ns| PageNs(ns.parse::().unwrap()))
+ .collect::>();
+ let namespaces_to = matches
+ .values_of("namespaces-to")
+ .unwrap()
+ .map(|ns| PageNs(ns.parse::().unwrap()))
+ .collect::>();
    let buf_size_mib = matches.value_of("buf-size").unwrap().parse::<usize>()?;
- let cutoff_threshold = matches.value_of("cutoff-threshold").unwrap().parse::()?;
- // let export_format = TODO
+ let cutoff_threshold = matches
+ .value_of("cutoff-threshold")
+ .unwrap()
+        .parse::<u32>()?;
+ let export_format = ExportFormat::try_from(matches.value_of("export-format").unwrap()).unwrap();
let cli_params = CliParams {
page_file,
@@ -151,8 +182,8 @@ pub fn init_cli_app() -> Result {
cutoff_threshold,
namespaces_from,
namespaces_to,
- export_format: ExportFormat::PlainText,
+ export_format,
};
Ok(cli_params)
-}
\ No newline at end of file
+}
diff --git a/src/link_count.rs b/src/link_count.rs
index d87b164..d0b4b77 100644
--- a/src/link_count.rs
+++ b/src/link_count.rs
@@ -7,7 +7,7 @@ use std::ops::AddAssign;
#[derive(Clone, Copy, Debug, Default)]
pub struct LinkCount {
pub direct: u32,
- pub indirect: u32
+ pub indirect: u32,
}
impl LinkCount {
@@ -24,7 +24,7 @@ impl AddAssign for LinkCount {
fn add_assign(&mut self, other: LinkCount) {
*self = Self {
direct: self.direct + other.direct,
- indirect: self.indirect + other.indirect
+ indirect: self.indirect + other.indirect,
};
}
}
@@ -47,4 +47,4 @@ impl PartialEq for LinkCount {
fn eq(&self, other: &LinkCount) -> bool {
(self.direct + self.indirect) == (other.direct + other.indirect)
}
-}
\ No newline at end of file
+}
diff --git a/src/main.rs b/src/main.rs
index 28ad1ee..f243479 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -29,8 +29,8 @@ fn main() -> Result<()> {
// Process page-table data
let pages = {
- let f = File::open(&cli.page_file).with_context(
- || format!("Failed to open page file ‘{}’", &cli.page_file.display()))?;
+ let f = File::open(&cli.page_file)
+ .with_context(|| format!("Failed to open page file ‘{}’", &cli.page_file.display()))?;
if util::is_probably_gzip(&cli.page_file) {
page_table::collect_pages(GzDecoder::new(f), &cli.namespaces_to, buf_size)
} else {
@@ -40,8 +40,12 @@ fn main() -> Result<()> {
// Process redirect-table data
let redirects = {
- let f = File::open(&cli.redirect_file).with_context(
- || format!("Failed to open redirect file ‘{}’", &cli.redirect_file.display()))?;
+ let f = File::open(&cli.redirect_file).with_context(|| {
+ format!(
+ "Failed to open redirect file ‘{}’",
+ &cli.redirect_file.display()
+ )
+ })?;
if util::is_probably_gzip(&cli.redirect_file) {
redirect_table::map_redirects(GzDecoder::new(f), pages, &cli.namespaces_to, buf_size)
} else {
@@ -51,14 +55,26 @@ fn main() -> Result<()> {
// Process pagelinks-table data
let pagelinks = {
- let f = File::open(&cli.pagelinks_file).with_context(
- || format!("Failed to open pagelinks file ‘{}’", &cli.pagelinks_file.display()))?;
+ let f = File::open(&cli.pagelinks_file).with_context(|| {
+ format!(
+ "Failed to open pagelinks file ‘{}’",
+ &cli.pagelinks_file.display()
+ )
+ })?;
if util::is_probably_gzip(&cli.pagelinks_file) {
- pagelinks_table::count_links(GzDecoder::new(f), redirects,
- (&cli.namespaces_from, &cli.namespaces_to), buf_size)
+ pagelinks_table::count_links(
+ GzDecoder::new(f),
+ redirects,
+ (&cli.namespaces_from, &cli.namespaces_to),
+ buf_size,
+ )
} else {
- pagelinks_table::count_links(f, redirects, (&cli.namespaces_from, &cli.namespaces_to),
- buf_size)
+ pagelinks_table::count_links(
+ f,
+ redirects,
+ (&cli.namespaces_from, &cli.namespaces_to),
+ buf_size,
+ )
}
}?;
diff --git a/src/page_table.rs b/src/page_table.rs
index 070bea0..f142a62 100644
--- a/src/page_table.rs
+++ b/src/page_table.rs
@@ -5,7 +5,7 @@ use crate::{
buffer_queue::BufferQueue,
chunked_reader::ChunkedReader,
progress_display::ProgressDisplay,
- util::{self, PageId, PageNs, PageTitle}
+ util::{self, PageId, PageNs, PageTitle},
};
use ahash::AHashMap;
@@ -15,8 +15,14 @@ use regex::Regex;
use std::io::Read;
use std::sync::Mutex;
-pub fn collect_pages<T>(source: T, namespaces: &[PageNs], buffer_size: usize)
-    -> Result<AHashMap<(PageNs, PageId), PageTitle>> where T: Read + Send {
+pub fn collect_pages<T>(
+ source: T,
+ namespaces: &[PageNs],
+ buffer_size: usize,
+) -> Result<AHashMap<(PageNs, PageId), PageTitle>>
+where
+ T: Read + Send,
+{
    let pages: Mutex<AHashMap<(PageNs, PageId), PageTitle>> = Mutex::new(AHashMap::new());
let mut source = ChunkedReader::new(source);
@@ -30,7 +36,10 @@ pub fn collect_pages(source: T, namespaces: &[PageNs], buffer_size: usize)
let regex = ®ex;
loop {
- eprint!("\r1/5 Extracting ‘page’ table data ({:.1} GiB processed)", progress.next());
+ eprint!(
+ "\r1/5 Extracting ‘page’ table data ({:.1} GiB processed)",
+ progress.next()
+ );
let buffer = buffers.pop();
let was_final_read = !source.read_into(&mut buffer.borrow(), buffer_size)?;
@@ -82,7 +91,9 @@ for the sake of completeness. Cost seems negligible.
fn build_page_regex(namespaces: &[PageNs]) -> Result<Regex> {
let ns_str = util::namespaces_to_string(namespaces);
- let pattern = format!(r"\((\d+),({}),'((?:[^']|\\'){{1,255}}?)','[a-z,:=]*?',1,", ns_str);
+ let pattern = format!(
+ r"\((\d+),({}),'((?:[^']|\\'){{1,255}}?)','[a-z,:=]*?',1,",
+ ns_str
+ );
Regex::new(&pattern).context("Building page regex")
}
-
diff --git a/src/pagelinks_table.rs b/src/pagelinks_table.rs
index 5d2e8f3..d767690 100644
--- a/src/pagelinks_table.rs
+++ b/src/pagelinks_table.rs
@@ -6,7 +6,7 @@ use crate::{
chunked_reader::ChunkedReader,
link_count::LinkCount,
progress_display::ProgressDisplay,
- util::{self, PageNs, PageTitle}
+ util::{self, PageNs, PageTitle},
};
use ahash::AHashMap;
@@ -19,12 +19,18 @@ use std::sync::Mutex;
#[derive(Debug, Default)]
pub struct Counter {
pub direct: u32,
- pub indirect: u32
+ pub indirect: u32,
}
-pub fn count_links<T>(source: T, redirects: AHashMap<(PageNs, PageTitle), PageTitle>,
- namespaces: (&[PageNs], &[PageNs]), buffer_size: usize)
-    -> Result<AHashMap<(PageNs, PageTitle), LinkCount>> where T: Read + Send {
+pub fn count_links<T>(
+ source: T,
+ redirects: AHashMap<(PageNs, PageTitle), PageTitle>,
+ namespaces: (&[PageNs], &[PageNs]),
+ buffer_size: usize,
+) -> Result<AHashMap<(PageNs, PageTitle), LinkCount>>
+where
+ T: Read + Send,
+{
    let pagelinks: Mutex<AHashMap<(PageNs, PageTitle), LinkCount>> = Mutex::new(AHashMap::new());
let mut source = ChunkedReader::new(source);
@@ -39,13 +45,17 @@ pub fn count_links(source: T, redirects: AHashMap<(PageNs, PageTitle), PageTi
let regex = ®ex;
loop {
- eprint!("\r3/5 Extracting ‘pagelinks’ table data and counting links \
- ({:.1} GiB processed)", progress.next());
+ eprint!(
+ "\r3/5 Extracting ‘pagelinks’ table data and counting links \
+ ({:.1} GiB processed)",
+ progress.next()
+ );
let buffer = buffers.pop();
let was_final_read = !source.read_into(&mut buffer.borrow(), buffer_size)?;
s.spawn_fifo(move |_| {
let mut new_pagelinks = AHashMap::<(PageNs, PageTitle), LinkCount>::new();
+
for cap in regex.captures_iter(&buffer.borrow()) {
let ns = PageNs(cap[1].parse::().unwrap());
let title = &cap[2];
@@ -60,8 +70,8 @@ pub fn count_links(source: T, redirects: AHashMap<(PageNs, PageTitle), PageTi
if let Some(link_count) = pl_query(&mut new_pagelinks, ns, title) {
link_count.direct += 1;
} else {
- new_pagelinks.insert((ns, PageTitle(title.to_string())),
- LinkCount::new(1, 0));
+ new_pagelinks
+ .insert((ns, PageTitle(title.to_string())), LinkCount::new(1, 0));
}
}
}
@@ -105,13 +115,14 @@ but it offers some protection against fauly matches in the case of erroneous dat
,(?:{}), : match the ‘pl_from_namespace’ on any of the numbers passed via the first namespaces
function parameter.
*/
-fn build_pagelinks_regex(namespaces_from: &[PageNs], namespaces_to: &[PageNs])
-    -> Result<Regex> {
+fn build_pagelinks_regex(namespaces_from: &[PageNs], namespaces_to: &[PageNs]) -> Result<Regex> {
let ns_from_str = util::namespaces_to_string(namespaces_from);
let ns_to_str = util::namespaces_to_string(namespaces_to);
- let pattern = format!(r"\(\d+,({}),'((?:[^']|\\'){{1,255}}?)',(?:{})\)",
- ns_to_str, ns_from_str);
+ let pattern = format!(
+ r"\(\d+,({}),'((?:[^']|\\'){{1,255}}?)',(?:{})\)",
+ ns_to_str, ns_from_str
+ );
Regex::new(&pattern).context("Building pagelinks regex")
}
@@ -121,23 +132,43 @@ the lifetime of a query parameter is required to encompass that of the HashMap.
temporary &str from the regex iterator, they would require cloning into String for each query.
*/
#[inline]
-fn rd_query<'a>(rd: &'a AHashMap<(PageNs, PageTitle), PageTitle>, ns: PageNs, title: &str)
- -> Option<&'a PageTitle> {
- unsafe { // Satisfy the &(PageNs, String) interface without new allocations
- let key = (ns, PageTitle(String::from_raw_parts(title.as_ptr() as *mut u8,
- title.len(), title.len())));
+fn rd_query<'a>(
+ rd: &'a AHashMap<(PageNs, PageTitle), PageTitle>,
+ ns: PageNs,
+ title: &str,
+) -> Option<&'a PageTitle> {
+ unsafe {
+        // Satisfy the &(PageNs, String) interface without new allocations
+ let key = (
+ ns,
+ PageTitle(String::from_raw_parts(
+ title.as_ptr() as *mut u8,
+ title.len(),
+ title.len(),
+ )),
+ );
let key = std::mem::ManuallyDrop::new(key);
rd.get(&*key)
}
}
#[inline]
-fn pl_query<'a>(pl: &'a mut AHashMap<(PageNs, PageTitle), LinkCount>, ns: PageNs, title: &str)
--> Option<&'a mut LinkCount> {
- unsafe { // As above
- let key = (ns, PageTitle(String::from_raw_parts(title.as_ptr() as *mut u8,
- title.len(), title.len())));
+fn pl_query<'a>(
+ pl: &'a mut AHashMap<(PageNs, PageTitle), LinkCount>,
+ ns: PageNs,
+ title: &str,
+) -> Option<&'a mut LinkCount> {
+ unsafe {
+ // As above
+ let key = (
+ ns,
+ PageTitle(String::from_raw_parts(
+ title.as_ptr() as *mut u8,
+ title.len(),
+ title.len(),
+ )),
+ );
let key = std::mem::ManuallyDrop::new(key);
pl.get_mut(&*key)
}
-}
\ No newline at end of file
+}
diff --git a/src/progress_display.rs b/src/progress_display.rs
index ea6da1c..f6e91cc 100644
--- a/src/progress_display.rs
+++ b/src/progress_display.rs
@@ -14,7 +14,7 @@ impl ProgressDisplay {
ProgressDisplay {
progress_counter: 0,
- buffer_size_gib
+ buffer_size_gib,
}
}
@@ -23,4 +23,4 @@ impl ProgressDisplay {
self.progress_counter += 1;
out
}
-}
\ No newline at end of file
+}
diff --git a/src/redirect_table.rs b/src/redirect_table.rs
index 708bfba..8e5dd47 100644
--- a/src/redirect_table.rs
+++ b/src/redirect_table.rs
@@ -5,7 +5,7 @@ use crate::{
buffer_queue::BufferQueue,
chunked_reader::ChunkedReader,
progress_display::ProgressDisplay,
- util::{self, PageId, PageNs, PageTitle}
+ util::{self, PageId, PageNs, PageTitle},
};
use ahash::AHashMap;
@@ -15,9 +15,15 @@ use regex::Regex;
use std::io::Read;
use std::sync::Mutex;
-pub fn map_redirects<T>(source: T, pages: AHashMap<(PageNs, PageId), PageTitle>,
- namespaces: &[PageNs], buffer_size: usize)
-    -> Result<AHashMap<(PageNs, PageTitle), PageTitle>> where T: Read + Send {
+pub fn map_redirects<T>(
+ source: T,
+ pages: AHashMap<(PageNs, PageId), PageTitle>,
+ namespaces: &[PageNs],
+ buffer_size: usize,
+) -> Result<AHashMap<(PageNs, PageTitle), PageTitle>>
+where
+ T: Read + Send,
+{
    let redirects: Mutex<AHashMap<(PageNs, PageTitle), PageTitle>> = Mutex::new(AHashMap::new());
let mut source = ChunkedReader::new(source);
@@ -32,21 +38,27 @@ pub fn map_redirects(source: T, pages: AHashMap<(PageNs, PageId), PageTitle>,
let regex = ®ex;
loop {
- eprint!("\r2/5 Extracting ‘redirect’ table data and mapping relations \
- ({:.1} GiB processed)", progress.next());
+ eprint!(
+ "\r2/5 Extracting ‘redirect’ table data and mapping relations \
+ ({:.1} GiB processed)",
+ progress.next()
+ );
let buffer = buffers.pop();
let was_final_read = !source.read_into(&mut buffer.borrow(), buffer_size)?;
s.spawn_fifo(move |_| {
let mut new_redirects: Vec<((PageNs, PageTitle), PageTitle)> = Vec::new();
+
for cap in regex.captures_iter(&buffer.borrow()) {
let source_id = PageId(cap[1].parse::().unwrap());
let source_ns = PageNs(cap[2].parse::().unwrap());
let target_title = &cap[3];
if let Some(source_title) = pages.get(&(source_ns, source_id)) {
- new_redirects.push(((source_ns, source_title.clone()),
- PageTitle(target_title.to_string())));
+ new_redirects.push((
+ (source_ns, source_title.clone()),
+ PageTitle(target_title.to_string()),
+ ));
}
}
@@ -88,7 +100,9 @@ to be empty.
fn build_redirect_regex(namespaces: &[PageNs]) -> Result<Regex> {
let ns_str = util::namespaces_to_string(namespaces);
- let pattern = format!(r"\((\d+),({}),'((?:[^']|\\'){{1,255}}?)','','(?:[^']|\\'){{0,255}}?'\)",
- ns_str);
+ let pattern = format!(
+ r"\((\d+),({}),'((?:[^']|\\'){{1,255}}?)','','(?:[^']|\\'){{0,255}}?'\)",
+ ns_str
+ );
Regex::new(&pattern).context("Building redirect pattern")
}
diff --git a/src/util.rs b/src/util.rs
index 0c59e96..1bb9af7 100644
--- a/src/util.rs
+++ b/src/util.rs
@@ -6,6 +6,7 @@ use crate::link_count::LinkCount;
use ahash::AHashMap;
use anyhow::Result;
+use std::convert::TryFrom;
use std::fmt;
use std::fs::File;
use std::io::Write;
@@ -34,7 +35,22 @@ impl fmt::Display for PageTitle {
#[derive(Clone, Copy)]
pub enum ExportFormat {
- PlainText
+ PlainText,
+ WikiText,
+    Markdown,
+}
+
+impl TryFrom<&str> for ExportFormat {
+ type Error = String;
+
+    fn try_from(format: &str) -> Result<Self, Self::Error> {
+ match format {
+ "text" => Ok(Self::PlainText),
+            "wikitext" => Ok(Self::WikiText),
+ "markdown" => Ok(Self::Markdown),
+ _ => Err(format!("Cannot convert ‘{}’ into ExportFormat", format)),
+ }
+ }
}
pub fn is_probably_gzip(path: &Path) -> bool {
@@ -50,12 +66,21 @@ pub fn build_output_filename(path: &Path, export_format: ExportFormat) -> PathBu
use ExportFormat::*;
let mut filename = path.to_path_buf();
- if path.extension().is_some() { // User supplied extension
- return filename
+ if path.extension().is_some() {
+ // User supplied extension
+ return filename;
}
match export_format {
- PlainText => { filename.set_extension("txt"); }
+ PlainText => {
+ filename.set_extension("txt");
+ },
+ WikiText => {
+ filename.set_extension("txt");
+ },
+ Markdown => {
+ filename.set_extension("md");
+ },
}
filename
@@ -93,9 +118,12 @@ fn underscores_to_spaces(mut s: String) -> String {
s
}
-pub fn sort_pagelinks(pagelinks: AHashMap<(PageNs, PageTitle), LinkCount>, cutoff: u32)
- -> Vec<((PageNs, PageTitle), LinkCount)> {
- let mut output: Vec<((PageNs, PageTitle), LinkCount)> = pagelinks.into_iter()
+pub fn sort_pagelinks(
+ pagelinks: AHashMap<(PageNs, PageTitle), LinkCount>,
+ cutoff: u32,
+) -> Vec<((PageNs, PageTitle), LinkCount)> {
+ let mut output: Vec<((PageNs, PageTitle), LinkCount)> = pagelinks
+ .into_iter()
.filter(|pl| pl.1.total() >= cutoff)
.collect();
@@ -103,23 +131,88 @@ pub fn sort_pagelinks(pagelinks: AHashMap<(PageNs, PageTitle), LinkCount>, cutof
output
}
-pub fn export_to_file(pages: Vec<((PageNs, PageTitle), LinkCount)>, mut file: File,
- format: ExportFormat) -> Result<()> {
+pub fn export_to_file(
+ pages: Vec<((PageNs, PageTitle), LinkCount)>,
+ mut file: File,
+ format: ExportFormat,
+) -> Result<()> {
use ExportFormat::*;
match format {
PlainText => write_plaintext(&mut file, pages)?,
+ WikiText => write_wikitext(&mut file, pages)?,
+ Markdown => write_markdown(&mut file, pages)?,
}
Ok(())
}
fn write_plaintext(file: &mut File, pages: Vec<((PageNs, PageTitle), LinkCount)>) -> Result<()> {
- writeln!(file, "page title [namespace] → links-total (direct + indirect)\n")?;
+ writeln!(
+ file,
+ "page title [namespace] → links-total (direct + indirect)\n"
+ )?;
for p in pages {
let title = underscores_to_spaces(((p.0).1).0);
- writeln!(file, "{} [{}] → {} ({} + {})",
- title, (p.0).0, p.1.total(), p.1.direct, p.1.indirect)?;
+ writeln!(
+ file,
+ "{} [{}] → {} ({} + {})",
+ title,
+ (p.0).0,
+ p.1.total(),
+ p.1.direct,
+ p.1.indirect
+ )?;
}
Ok(())
-}
\ No newline at end of file
+}
+
+fn write_wikitext(file: &mut File, pages: Vec<((PageNs, PageTitle), LinkCount)>) -> Result<()> {
+ writeln!(file, "{{|class=\"wikitable sortable\"")?;
+ writeln!(
+ file,
+ "! Page !! Ns !! Links total !! Direct !! via redirect\n|-"
+ )?;
+
+ for p in pages {
+ let title = underscores_to_spaces(((p.0).1).0);
+ writeln!(
+ file,
+ "| [[{}]] || {} || {} || {} || {}\n|-",
+ title,
+ (p.0).0,
+ p.1.total(),
+ p.1.direct,
+ p.1.indirect
+ )?;
+ }
+
+ writeln!(file, "|}}")?;
+ Ok(())
+}
+
+fn write_markdown(file: &mut File, pages: Vec<((PageNs, PageTitle), LinkCount)>) -> Result<()> {
+    // NOTE: Markdown tables are non-standard (GitHub Flavored Markdown); this function also
+ // doesn't pretty-print the table, which would require significantly more logic.
+ writeln!(
+ file,
+ "Page | Ns | Links total | Direct | via redirect\n\
+ :--- | :---: | ---: | ---: | ---:"
+ )?;
+
+ for p in pages {
+ let title = underscores_to_spaces(((p.0).1).0);
+ writeln!(
+ file,
+ "{} | {} | {} | {} | {}",
+ title,
+ (p.0).0,
+ p.1.total(),
+ p.1.direct,
+ p.1.indirect
+ )?;
+ }
+
+ Ok(())
+}
+