Add WikiText and Markdown export formats; Apply rustfmt to entire source
adrian5 committed Jun 23, 2020
1 parent be8ff76 commit 3999c68
Showing 12 changed files with 368 additions and 165 deletions.
14 changes: 10 additions & 4 deletions README.md
@@ -28,9 +28,9 @@ The program requires three input files to operate:

For the English Wikipedia, you can get these at <https://dumps.wikimedia.org/enwiki/> as:

* enwiki-yyyy-mm-dd-page.sql.gz
* enwiki-yyyy-mm-dd-redirect.sql.gz
* enwiki-yyyy-mm-dd-pagelinks.sql.gz
* enwiki-yyyymmdd-page.sql.gz
* enwiki-yyyymmdd-redirect.sql.gz
* enwiki-yyyymmdd-pagelinks.sql.gz

### Hardware

@@ -62,9 +62,15 @@ Custom (128 MiB) buffer size and a link-count cutoff of 185K, below which pages
wikidigest-link-count -p page.sql.gz -r redirect.sql.gz -l pagelinks.sql.gz -o /tmp/185k-or-more -b 128 -c 185000
```

Export as a different format ([WikiText](https://en.wikipedia.org/wiki/Help:Wikitext) table):

```
wikidigest-link-count -p page.sql -r redirect.sql -l pagelinks.sql -e wikitext
```
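
The Markdown (GFM) table export added in this commit is selected the same way; an illustrative invocation, using the `markdown` value listed in the new `--export-as` option's help text (see the cli.rs diff below):

```
wikidigest-link-count -p page.sql -r redirect.sql -l pagelinks.sql -e markdown
```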

## Results

Results are written to a plain text file.
Results are written to an output file, by default as plain text to `./results.txt`.

Below are results for the English Wikipedia, Apr 2020 – pages with 200K or more incoming links
within the main (0) namespace:
2 changes: 0 additions & 2 deletions notes.md
@@ -24,6 +24,4 @@
- XML
- JSON
- HTML (table)
- Markdown (table)
- Wiki (table)

8 changes: 4 additions & 4 deletions src/buffer_queue.rs
@@ -10,7 +10,7 @@ const QUERY_INTERVAL_MS: u64 = 250;

pub struct BufferQueue {
buffers: Vec<Mutex<String>>,
queue: Mutex<VecDeque<usize>>
queue: Mutex<VecDeque<usize>>,
}

impl BufferQueue {
@@ -25,7 +25,7 @@ impl BufferQueue {

Self {
buffers,
queue: Mutex::new(queue)
queue: Mutex::new(queue),
}
}

@@ -34,7 +34,7 @@ impl BufferQueue {
Buffer {
id,
inner: &self.buffers[id],
queue: &self.queue
queue: &self.queue,
}
}

@@ -57,7 +57,7 @@ Once released, it pushes its ID back into the shared queue, allowing the resourc
pub struct Buffer<'a> {
id: usize,
inner: &'a Mutex<String>,
queue: &'a Mutex<VecDeque<usize>>
queue: &'a Mutex<VecDeque<usize>>,
}

impl<'a> Buffer<'a> {
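
For context on the `Buffer` / `BufferQueue` pair reformatted above, here is a minimal sketch of the acquire-and-release pattern the doc comment describes. The field layout mirrors the diff, but the `try_acquire` method and the `Drop` impl are assumptions, not the crate's actual code (the `QUERY_INTERVAL_MS` constant in the diff suggests the real implementation polls until a buffer frees up):

```
use std::collections::VecDeque;
use std::sync::Mutex;

struct BufferQueue {
    buffers: Vec<Mutex<String>>,
    queue: Mutex<VecDeque<usize>>, // IDs of buffers that are currently free
}

struct Buffer<'a> {
    id: usize,
    inner: &'a Mutex<String>,
    queue: &'a Mutex<VecDeque<usize>>,
}

impl BufferQueue {
    // Hand out the next free buffer, if one is available (assumed behavior).
    fn try_acquire(&self) -> Option<Buffer<'_>> {
        let id = self.queue.lock().unwrap().pop_front()?;
        Some(Buffer {
            id,
            inner: &self.buffers[id],
            queue: &self.queue,
        })
    }
}

impl<'a> Drop for Buffer<'a> {
    // On release, push the ID back into the shared queue so the
    // underlying buffer can be handed out again.
    fn drop(&mut self) {
        self.queue.lock().unwrap().push_back(self.id);
    }
}

fn main() {
    let pool = BufferQueue {
        buffers: (0..2).map(|_| Mutex::new(String::new())).collect(),
        queue: Mutex::new((0..2).collect()),
    };
    {
        let buf = pool.try_acquire().expect("a buffer is free");
        buf.inner.lock().unwrap().push_str("INSERT INTO …");
    } // Dropped here: its ID returns to the queue
    assert!(pool.try_acquire().is_some());
}
```
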
17 changes: 10 additions & 7 deletions src/chunked_reader.rs
@@ -4,18 +4,18 @@ the door to parallel processing.
*/
use std::io::{Read, Result};

pub struct ChunkedReader<T> {
pub struct ChunkedReader<T: Read> {
source: T,
remainder: Vec<u8>,
exhausted: bool
exhausted: bool,
}

impl<T: Read> ChunkedReader<T> {
pub fn new(source: T) -> Self {
Self {
source,
remainder: Vec::new(),
exhausted: false
exhausted: false,
}
}

@@ -44,7 +44,8 @@ impl<T: Read> ChunkedReader<T> {
dest.set_len(target_size); // Restore to full target size
while bytes_read_total < target_size {
let bytes_read = self.source.read(&mut dest[bytes_read_total..])?;
if bytes_read == 0 { // Assume final read
if bytes_read == 0 {
// Assume final read
dest.truncate(bytes_read_total);
self.exhausted = true;
return Ok(false); // Signal final read
@@ -88,7 +89,8 @@ impl<T: Read> ChunkedReader<T> {
chunk.set_len(target_size); // Restore to full target size
while bytes_read_total < target_size {
let bytes_read = self.source.read(&mut chunk[bytes_read_total..])?;
if bytes_read == 0 { // Assume final read
if bytes_read == 0 {
// Assume final read
chunk.truncate(bytes_read_total);
self.exhausted = true;
break;
@@ -101,11 +103,12 @@ impl<T: Read> ChunkedReader<T> {
// Split at last newline, keep right-hand side for next chunk
self.remainder.clear();
if let Some(cutoff) = chunk.rfind('\n') {
self.remainder.extend_from_slice(&chunk.as_bytes()[cutoff..]);
self.remainder
.extend_from_slice(&chunk.as_bytes()[cutoff..]);
chunk.truncate(cutoff);
}
}

Ok(Some(chunk))
}
}
}
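
The hunk above contains the reader's central idea: cut each chunk at its last newline and carry the tail into the next read, so no line is split across chunk boundaries. A self-contained sketch of that step, using a hypothetical free function that mirrors the logic in the diff (not the crate's actual API):

```
// Split `chunk` at its last newline: the left part is ready for parsing,
// while the tail (starting at the newline) is carried into the next chunk.
fn split_off_remainder(chunk: &mut String, remainder: &mut Vec<u8>) {
    remainder.clear();
    if let Some(cutoff) = chunk.rfind('\n') {
        remainder.extend_from_slice(&chunk.as_bytes()[cutoff..]);
        chunk.truncate(cutoff);
    }
}

fn main() {
    let mut chunk = String::from("INSERT INTO page VALUES (1);\nINSERT INTO pa");
    let mut remainder = Vec::new();

    split_off_remainder(&mut chunk, &mut remainder);

    assert_eq!(chunk, "INSERT INTO page VALUES (1);");
    assert_eq!(remainder, b"\nINSERT INTO pa".to_vec());
}
```
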
191 changes: 111 additions & 80 deletions src/cli.rs
@@ -4,8 +4,9 @@ Parsing CLI arguments
use crate::util::{ExportFormat, PageNs};

use anyhow::Result;
use clap::{Arg, App};
use clap::{App, Arg};

use std::convert::TryFrom;
use std::path::PathBuf;
use std::str::FromStr;

@@ -18,12 +19,11 @@ pub struct CliParams {
pub namespaces_to: Vec<PageNs>,
pub buf_size_mib: usize,
pub cutoff_threshold: u32,
pub export_format: ExportFormat
pub export_format: ExportFormat,
}

pub fn init_cli_app() -> Result<CliParams> {
let matches =
App::new("wikidigest-link-count")
let matches = App::new("wikidigest-link-count")
.version("0.1")
.author("github.com/adrian5")
.about("Find the most linked-to pages in MediaWiki databases")
@@ -32,74 +32,80 @@ pub fn init_cli_app() -> Result<CliParams> {
1. The page-table SQL dump (…page.sql.gz)\n\
2. The redirect-table SQL dump (…redirect.sql.gz)\n\
3. The pagelinks-table SQL dump (…pagelinks.sql.gz)\n\n\
For the English Wikipedia, you can get these at https://dumps.wikimedia.org/enwiki/"
For the English Wikipedia, you can get these at https://dumps.wikimedia.org/enwiki/",
)
// Page file
.arg(Arg::with_name("file-page")
.short("p")
.long("page-file")
.value_name("PATH")
.help("Path to ‘…page.sql(.gz)’")
.takes_value(true)
.required(true)
.arg(
Arg::with_name("file-page")
.short("p")
.long("page-file")
.value_name("PATH")
.help("Path to ‘…page.sql(.gz)’")
.takes_value(true)
.required(true),
)
// Redirect file
.arg(Arg::with_name("file-redirect")
.short("r")
.long("redirect-file")
.value_name("PATH")
.help("Path to ‘…redirect.sql(.gz)’")
.takes_value(true)
.required(true)
.arg(
Arg::with_name("file-redirect")
.short("r")
.long("redirect-file")
.value_name("PATH")
.help("Path to ‘…redirect.sql(.gz)’")
.takes_value(true)
.required(true),
)
// Pagelinks file
.arg(Arg::with_name("file-pagelinks")
.short("l")
.long("pagelinks-file")
.value_name("PATH")
.help("Path to ‘…pagelinks.sql(.gz)’")
.takes_value(true)
.required(true)
.arg(
Arg::with_name("file-pagelinks")
.short("l")
.long("pagelinks-file")
.value_name("PATH")
.help("Path to ‘…pagelinks.sql(.gz)’")
.takes_value(true)
.required(true),
)
// Output file
.arg(Arg::with_name("file-output")
.short("o")
.long("output-file")
.value_name("PATH")
.help("Path to write results to")
.default_value("./results")
.takes_value(true)
.arg(
Arg::with_name("file-output")
.short("o")
.long("output-file")
.value_name("PATH")
.help("Path to write results to")
.default_value("./results")
.takes_value(true),
)
// Namespaces (From)
.arg(Arg::with_name("namespaces-from")
.short("f")
.long("from-namespaces")
.value_name("ns,ns,…")
.help("Namespace(s) of pages from which links may originate")
.default_value("0")
.takes_value(true)
.use_delimiter(true)
.arg(
Arg::with_name("namespaces-from")
.short("f")
.long("from-namespaces")
.value_name("ns,ns,…")
.help("Namespace(s) of pages from which links may originate")
.default_value("0")
.takes_value(true)
.use_delimiter(true),
)
// Namespaces (To)
.arg(Arg::with_name("namespaces-to")
.short("t")
.long("to-namespaces")
.value_name("ns,ns,…")
.help("Namespace(s) of pages to which links may lead")
.default_value("0")
.takes_value(true)
.use_delimiter(true)
.arg(
Arg::with_name("namespaces-to")
.short("t")
.long("to-namespaces")
.value_name("ns,ns,…")
.help("Namespace(s) of pages to which links may lead")
.default_value("0")
.takes_value(true)
.use_delimiter(true),
)
// Buffer size
.arg(Arg::with_name("buf-size")
.short("b")
.long("bufsize")
.value_name("MiB")
.help("Buffer size per thread")
.default_value("32")
.takes_value(true)
.validator(|t| {
match t.parse::<u32>() {
.arg(
Arg::with_name("buf-size")
.short("b")
.long("bufsize")
.value_name("MiB")
.help("Buffer size per thread")
.default_value("32")
.takes_value(true)
.validator(|bs| match bs.parse::<u32>() {
Err(_) => Err("must be a positive number".to_string()),
Ok(value) => {
if value > 8 && value < 1024 {
@@ -108,39 +114,64 @@
Err("must be between 8 and 1024".to_string())
}
}
}
})
}),
)
// Cutoff threshold
.arg(Arg::with_name("cutoff-threshold")
.short("c")
.long("cutoff")
.value_name("THRESHOLD")
.help("Output only pages with link-count above threshold")
.default_value("25000")
.takes_value(true)
.validator(|t| {
t.parse::<u32>().map(|_| ()).map_err(|_| "must be a positive number".to_string())
})
.arg(
Arg::with_name("cutoff-threshold")
.short("c")
.long("cutoff")
.value_name("THRESHOLD")
.help("Output only pages with link-count above threshold")
.default_value("25000")
.takes_value(true)
.validator(|t| {
t.parse::<u32>()
.map(|_| ())
.map_err(|_| "must be a positive number".to_string())
}),
)
// Export format
// TODO
.get_matches();
.arg(
Arg::with_name("export-format")
.short("e")
.long("export-as")
.value_name("FORMAT")
.help("Format to output results as")
.long_help("Supported formats are: text (plain), wikitext, markdown (gfm)")
.default_value("text")
.takes_value(true)
.validator(|f| {
ExportFormat::try_from(f.as_str())
.map(|_| ())
.map_err(|e| e)
}),
)
.get_matches();

// Conversion
let page_file = PathBuf::from_str(matches.value_of("file-page").unwrap())?;
let redirect_file = PathBuf::from_str(matches.value_of("file-redirect").unwrap())?;
let pagelinks_file = PathBuf::from_str(matches.value_of("file-pagelinks").unwrap())?;
let output_file = PathBuf::from_str(matches.value_of("file-output").unwrap())?;

let namespaces_from = matches.values_of("namespaces-from").unwrap()
.map(|ns| PageNs(ns.parse::<u32>().unwrap()) ).collect::<Vec<PageNs>>();
let namespaces_to = matches.values_of("namespaces-to").unwrap()
.map(|ns| PageNs(ns.parse::<u32>().unwrap()) ).collect::<Vec<PageNs>>();
let namespaces_from = matches
.values_of("namespaces-from")
.unwrap()
.map(|ns| PageNs(ns.parse::<u32>().unwrap()))
.collect::<Vec<PageNs>>();
let namespaces_to = matches
.values_of("namespaces-to")
.unwrap()
.map(|ns| PageNs(ns.parse::<u32>().unwrap()))
.collect::<Vec<PageNs>>();

let buf_size_mib = matches.value_of("buf-size").unwrap().parse::<usize>()?;
let cutoff_threshold = matches.value_of("cutoff-threshold").unwrap().parse::<u32>()?;
// let export_format = TODO
let cutoff_threshold = matches
.value_of("cutoff-threshold")
.unwrap()
.parse::<u32>()?;
let export_format = ExportFormat::try_from(matches.value_of("export-format").unwrap()).unwrap();

let cli_params = CliParams {
page_file,
@@ -151,8 +182,8 @@
cutoff_threshold,
namespaces_from,
namespaces_to,
export_format: ExportFormat::PlainText,
export_format,
};

Ok(cli_params)
}
}
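
The new `--export-as` option relies on `ExportFormat::try_from(&str)` from `src/util.rs`, which is not shown in this diff. A plausible shape for that conversion, inferred from the values named in the long help and from the `String` errors the clap validator expects — a sketch, not the actual `util.rs` code:

```
use std::convert::TryFrom;

#[derive(Clone, Copy, Debug)]
pub enum ExportFormat {
    PlainText, // appears verbatim in the old cli.rs line this commit replaces
    WikiText,  // variant name assumed
    Markdown,  // variant name assumed
}

impl TryFrom<&str> for ExportFormat {
    type Error = String;

    // Map a CLI value onto a format, mirroring the values named in the
    // `--export-as` long help: "text", "wikitext", "markdown".
    fn try_from(s: &str) -> Result<Self, Self::Error> {
        match s {
            "text" => Ok(ExportFormat::PlainText),
            "wikitext" => Ok(ExportFormat::WikiText),
            "markdown" => Ok(ExportFormat::Markdown),
            other => Err(format!("unsupported export format ‘{}’", other)),
        }
    }
}

fn main() {
    assert!(ExportFormat::try_from("wikitext").is_ok());
    assert!(ExportFormat::try_from("csv").is_err());
}
```

Only `ExportFormat::PlainText` is confirmed by the diff itself; the other variant names above are guesses for illustration.
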