diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..418f728 --- /dev/null +++ b/.gitignore @@ -0,0 +1,42 @@ +## Ruby +*.gem +*.rbc +/.config +/coverage/ +/InstalledFiles +/pkg/ +/spec/reports/ +/spec/tmp/ +/spec/examples.txt +/test/tmp/ +/test/version_tmp/ +/tmp/ + +## Used by dotenv library to load environment variables +.env + +## Documentation cache and generated files: +/.yardoc/ +/_yardoc/ +/doc/ +/rdoc/ +/coverage/ + +## Environment normalization: +/.bundle/ +/vendor/bundle +/lib/bundler/man/ + +## OS generated files +.DS_Store +.DS_Store? +._* +.Spotlight-V100 +.Trashes +ehthumbs.db +Thumbs.db + +## Other +/.bundle/ +Gemfile.lock +.rvmrc \ No newline at end of file diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..2f34bc3 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,4 @@ +language: ruby +before_install: gem update --system +script: bundle exec rake +rvm: 2.5 \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..069de28 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,3 @@ +# 0.1.0 + +Initial commit! 
🎉 \ No newline at end of file diff --git a/Gemfile b/Gemfile new file mode 100644 index 0000000..851fabc --- /dev/null +++ b/Gemfile @@ -0,0 +1,2 @@ +source 'https://rubygems.org' +gemspec diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..28041b1 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2019 Edwin Onuonga + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..6b6c4e3 --- /dev/null +++ b/README.md @@ -0,0 +1,3 @@ +# Arx + +A Ruby interface for querying academic papers on the arXiv search API. 
\ No newline at end of file diff --git a/Rakefile b/Rakefile new file mode 100644 index 0000000..10119b7 --- /dev/null +++ b/Rakefile @@ -0,0 +1,7 @@ +require 'bundler/gem_tasks' +require 'rspec/core/rake_task' + +desc 'Run application specs' +RSpec::Core::RakeTask.new :spec + +task default: [:spec] \ No newline at end of file diff --git a/arx.gemspec b/arx.gemspec new file mode 100644 index 0000000..d61a8b6 --- /dev/null +++ b/arx.gemspec @@ -0,0 +1,24 @@ +lib = File.expand_path('../lib', __FILE__) +$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib) +require 'arx/version' + +Gem::Specification.new do |spec| + spec.name = 'arx' + spec.version = Arx::VERSION + spec.authors = ['Edwin Onuonga'] + spec.email = ['edwinonuonga@gmail.com'] + + spec.summary = %q{A Ruby interface for querying academic papers on the arXiv search API.} + spec.license = 'MIT' + spec.require_paths = ['lib'] + spec.files = Dir.glob('lib/**/*', File::FNM_DOTMATCH) + %w[ + Gemfile LICENSE CHANGELOG.md README.md Rakefile arx.gemspec + ] + + spec.add_runtime_dependency 'nokogiri', '~> 1.10' + spec.add_runtime_dependency 'nokogiri-happymapper', '~> 0.8' + + spec.add_development_dependency 'bundler', '~> 2.0' + spec.add_development_dependency 'rake', '~> 12.3' + spec.add_development_dependency 'rspec', '~> 3.7' +end \ No newline at end of file diff --git a/lib/arx.rb b/lib/arx.rb new file mode 100644 index 0000000..a0481de --- /dev/null +++ b/lib/arx.rb @@ -0,0 +1,53 @@ +# frozen_string_literal: true + +require 'nokogiri' +require 'open-uri' +require 'arx/version' +require 'arx/categories' +require 'arx/query/query' +require 'arx/query/validate' +require 'arx/entities/author' +require 'arx/entities/category' +require 'arx/entities/paper' + +# A Ruby interface for querying academic papers on the arXiv search API. +module Arx + + # The arXiv search API endpoint. + ENDPOINT = 'http://export.arxiv.org/api/query?' + + # Performs a search query for papers on the arXiv search API. 
+ # + # @param ids [Array] The IDs of the arXiv papers to restrict the query to. + # @param sort_by [Symbol] The sorting criteria for the returned results (see {Query::SORT_BY}). + # @param sort_order [Symbol] The sorting order for the returned results (see {Query::SORT_ORDER}). + # @return [Array, Paper] The {Paper}(s) found by the search query. + def self.search(*ids, sort_by: :relevance, sort_order: :descending) + query = Query.new(*ids, sort_by: sort_by, sort_order: sort_order) + + yield query if block_given? + + document = Nokogiri::XML open(ENDPOINT + query.to_s + '&max_results=10000') + document.remove_namespaces! + + results = Paper.parse(document, single: false).reject {|paper| paper.id.empty?} + raise MissingPaper.new(ids.first) if results.empty? && ids.size == 1 + ids.size == 1 && results.size == 1 ? results.first : results + end +end + +# Performs a search query for papers on the arXiv search API. +# +# @note This is an alias of the {Arx.search} method. +# @see Arx.search +# @param ids [Array] The IDs of the arXiv papers to restrict the query to. +# @param sort_by [Symbol] The sorting criteria for the returned results (see {Arx::Query::SORT_BY}). +# @param sort_order [Symbol] The sorting order for the returned results (see {Arx::Query::SORT_ORDER}). +# @return [Array, Paper] The {Arx::Paper}(s) found by the search query. +def Arx(*ids, sort_by: :relevance, sort_order: :descending, &block) + if block_given? 
+ Arx.search *ids, sort_by: sort_by, sort_order: sort_order, &block + else + Arx.search *ids, sort_by: sort_by, sort_order: sort_order + end +end \ No newline at end of file diff --git a/lib/arx/categories.rb b/lib/arx/categories.rb new file mode 100644 index 0000000..88bae94 --- /dev/null +++ b/lib/arx/categories.rb @@ -0,0 +1,161 @@ +# frozen_string_literal: true + +module Arx + + # arXiv categories and their full names + CATEGORIES = { + 'astro-ph' => 'Astrophysics', + 'astro-ph.CO' => 'Cosmology and Nongalactic Astrophysics', + 'astro-ph.EP' => 'Earth and Planetary Astrophysics', + 'astro-ph.GA' => 'Astrophysics of Galaxies', + 'astro-ph.HE' => 'High Energy Astrophysical Phenomena', + 'astro-ph.IM' => 'Instrumentation and Methods for Astrophysics', + 'astro-ph.SR' => 'Solar and Stellar Astrophysics', + 'cond-mat.dis-nn' => 'Disordered Systems and Neural Networks', + 'cond-mat.mes-hall' => 'Mesoscale and Nanoscale Physics', + 'cond-mat.mtrl-sci' => 'Materials Science', + 'cond-mat.other' => 'Other Condensed Matter', + 'cond-mat.quant-gas' => 'Quantum Gases', + 'cond-mat.soft' => 'Soft Condensed Matter', + 'cond-mat.stat-mech' => 'Statistical Mechanics', + 'cond-mat.str-el' => 'Strongly Correlated Electrons', + 'cond-mat.supr-con' => 'Superconductivity', + 'cs.AI' => 'Artificial Intelligence', + 'cs.AR' => 'Hardware Architecture', + 'cs.CC' => 'Computational Complexity', + 'cs.CE' => 'Computational Engineering, Finance, and Science', + 'cs.CG' => 'Computational Geometry', + 'cs.CL' => 'Computation and Language', + 'cs.CR' => 'Cryptography and Security', + 'cs.CV' => 'Computer Vision and Pattern Recognition', + 'cs.CY' => 'Computers and Society', + 'cs.DB' => 'Databases', + 'cs.DC' => 'Distributed, Parallel, and Cluster Computing', + 'cs.DL' => 'Digital Libraries', + 'cs.DM' => 'Discrete Mathematics', + 'cs.DS' => 'Data Structures and Algorithms', + 'cs.ET' => 'Emerging Technologies', + 'cs.FL' => 'Formal Languages and Automata Theory', + 'cs.GL' => 'General 
Literature', + 'cs.GR' => 'Graphics', + 'cs.GT' => 'Computer Science and Game Theory', + 'cs.HC' => 'Human-Computer Interaction', + 'cs.IR' => 'Information Retrieval', + 'cs.IT' => 'Information Theory', + 'cs.LG' => 'Learning', + 'cs.LO' => 'Logic in Computer Science', + 'cs.MA' => 'Multiagent Systems', + 'cs.MM' => 'Multimedia', + 'cs.MS' => 'Mathematical Software', + 'cs.NA' => 'Numerical Analysis', + 'cs.NE' => 'Neural and Evolutionary Computing', + 'cs.NI' => 'Networking and Internet Architecture', + 'cs.OH' => 'Other Computer Science', + 'cs.OS' => 'Operating Systems', + 'cs.PF' => 'Performance', + 'cs.PL' => 'Programming Languages', + 'cs.RO' => 'Robotics', + 'cs.SC' => 'Symbolic Computation', + 'cs.SD' => 'Sound', + 'cs.SE' => 'Software Engineering', + 'cs.SI' => 'Social and Information Networks', + 'cs.SY' => 'Systems and Control', + 'econ.EM' => 'Econometrics', + 'eess.AS' => 'Audio and Speech Processing', + 'eess.IV' => 'Image and Video Processing', + 'eess.SP' => 'Signal Processing', + 'gr-qc' => 'General Relativity and Quantum Cosmology', + 'hep-ex' => 'High Energy Physics - Experiment', + 'hep-lat' => 'High Energy Physics - Lattice', + 'hep-ph' => 'High Energy Physics - Phenomenology', + 'hep-th' => 'High Energy Physics - Theory', + 'math.AC' => 'Commutative Algebra', + 'math.AG' => 'Algebraic Geometry', + 'math.AP' => 'Analysis of PDEs', + 'math.AT' => 'Algebraic Topology', + 'math.CA' => 'Classical Analysis and ODEs', + 'math.CO' => 'Combinatorics', + 'math.CT' => 'Category Theory', + 'math.CV' => 'Complex Variables', + 'math.DG' => 'Differential Geometry', + 'math.DS' => 'Dynamical Systems', + 'math.FA' => 'Functional Analysis', + 'math.GM' => 'General Mathematics', + 'math.GN' => 'General Topology', + 'math.GR' => 'Group Theory', + 'math.GT' => 'Geometric Topology', + 'math.HO' => 'History and Overview', + 'math.IT' => 'Information Theory', + 'math.KT' => 'K-Theory and Homology', + 'math.LO' => 'Logic', + 'math.MG' => 'Metric Geometry', + 'math.MP' 
=> 'Mathematical Physics', + 'math.NA' => 'Numerical Analysis', + 'math.NT' => 'Number Theory', + 'math.OA' => 'Operator Algebras', + 'math.OC' => 'Optimization and Control', + 'math.PR' => 'Probability', + 'math.QA' => 'Quantum Algebra', + 'math.RA' => 'Rings and Algebras', + 'math.RT' => 'Representation Theory', + 'math.SG' => 'Symplectic Geometry', + 'math.SP' => 'Spectral Theory', + 'math.ST' => 'Statistics Theory', + 'math-ph' => 'Mathematical Physics', + 'nlin.AO' => 'Adaptation and Self-Organizing Systems', + 'nlin.CD' => 'Chaotic Dynamics', + 'nlin.CG' => 'Cellular Automata and Lattice Gases', + 'nlin.PS' => 'Pattern Formation and Solitons', + 'nlin.SI' => 'Exactly Solvable and Integrable Systems', + 'nucl-ex' => 'Nuclear Experiment', + 'nucl-th' => 'Nuclear Theory', + 'physics.acc-ph' => 'Accelerator Physics', + 'physics.ao-ph' => 'Atmospheric and Oceanic Physics', + 'physics.app-ph' => 'Applied Physics', + 'physics.atm-clus' => 'Atomic and Molecular Clusters', + 'physics.atom-ph' => 'Atomic Physics', + 'physics.bio-ph' => 'Biological Physics', + 'physics.chem-ph' => 'Chemical Physics', + 'physics.class-ph' => 'Classical Physics', + 'physics.comp-ph' => 'Computational Physics', + 'physics.data-an' => 'Data Analysis, Statistics and Probability', + 'physics.ed-ph' => 'Physics Education', + 'physics.flu-dyn' => 'Fluid Dynamics', + 'physics.gen-ph' => 'General Physics', + 'physics.geo-ph' => 'Geophysics', + 'physics.hist-ph' => 'History and Philosophy of Physics', + 'physics.ins-det' => 'Instrumentation and Detectors', + 'physics.med-ph' => 'Medical Physics', + 'physics.optics' => 'Optics', + 'physics.plasm-ph' => 'Plasma Physics', + 'physics.pop-ph' => 'Popular Physics', + 'physics.soc-ph' => 'Physics and Society', + 'physics.space-ph' => 'Space Physics', + 'q-bio.BM' => 'Biomolecules', + 'q-bio.CB' => 'Cell Behavior', + 'q-bio.GN' => 'Genomics', + 'q-bio.MN' => 'Molecular Networks', + 'q-bio.NC' => 'Neurons and Cognition', + 'q-bio.OT' => 'Other Quantitative 
Biology', + 'q-bio.PE' => 'Populations and Evolution', + 'q-bio.QM' => 'Quantitative Methods', + 'q-bio.SC' => 'Subcellular Processes', + 'q-bio.TO' => 'Tissues and Organs', + 'q-fin.CP' => 'Computational Finance', + 'q-fin.EC' => 'Economics', + 'q-fin.GN' => 'General Finance', + 'q-fin.MF' => 'Mathematical Finance', + 'q-fin.PM' => 'Portfolio Management', + 'q-fin.PR' => 'Pricing of Securities', + 'q-fin.RM' => 'Risk Management', + 'q-fin.ST' => 'Statistical Finance', + 'q-fin.TR' => 'Trading and Market Microstructure', + 'quant-ph' => 'Quantum Physics', + 'stat.AP' => 'Applications', + 'stat.CO' => 'Computation', + 'stat.ME' => 'Methodology', + 'stat.ML' => 'Machine Learning', + 'stat.OT' => 'Other Statistics', + 'stat.TH' => 'Statistics Theory' + }.freeze +end \ No newline at end of file diff --git a/lib/arx/cleaner.rb b/lib/arx/cleaner.rb new file mode 100644 index 0000000..c8d54db --- /dev/null +++ b/lib/arx/cleaner.rb @@ -0,0 +1,13 @@ +module Arx + + # Class for cleaning strings. + class Cleaner + + # Cleans strings. + # @param [String] string Removes newline/return characters and multiple spaces from a string. + # @return [String] The cleaned string. + def self.clean(string) + string.gsub(/\r\n|\r|\n/, ' ').strip.squeeze ' ' + end + end +end \ No newline at end of file diff --git a/lib/arx/entities/author.rb b/lib/arx/entities/author.rb new file mode 100644 index 0000000..fe36ae0 --- /dev/null +++ b/lib/arx/entities/author.rb @@ -0,0 +1,29 @@ +require 'happymapper' +require 'arx/cleaner' + +module Arx + + # Entity/model representing an arXiv paper's author. + class Author + include HappyMapper + + tag 'author' + + # @!method name + # The name of the author. + # @return [String] + element :name, Cleaner, tag: 'name', parser: :clean + + # @!method affiliations + # The author's affiliations. + # @return [Array] + has_many :affiliations, Cleaner, tag: 'affiliation', parser: :clean + + # @!method affiliations? + # Whether or not the author has any affiliations. 
+ # @return [Boolean] + def affiliations? + !affiliations.empty? + end + end +end \ No newline at end of file diff --git a/lib/arx/entities/category.rb b/lib/arx/entities/category.rb new file mode 100644 index 0000000..37231f6 --- /dev/null +++ b/lib/arx/entities/category.rb @@ -0,0 +1,24 @@ +require 'arx/categories' +require 'arx/cleaner' + +module Arx + + # Entity/model representing an arXiv paper's category. + class Category + include HappyMapper + + tag 'category' + + # @!method name + # The abbreviated name of the category. + # @return [String] + attribute :name, Cleaner, parser: :clean, tag: 'term' + + # The full name of the category. + # @see CATEGORIES + # @return [String] + def full_name + CATEGORIES[term] + end + end +end \ No newline at end of file diff --git a/lib/arx/entities/link.rb b/lib/arx/entities/link.rb new file mode 100644 index 0000000..90e820f --- /dev/null +++ b/lib/arx/entities/link.rb @@ -0,0 +1,22 @@ +require 'happymapper' + +module Arx + + # Helper entity/model representing a link on an arXiv paper. + class Link + include HappyMapper + + tag 'link' + + attribute :title, String + attribute :rel, String + attribute :type, String + attribute :href, String + + %w[pdf doi].each do |link_type| + define_method "#{link_type}?" do + @title == link_type + end + end + end +end \ No newline at end of file diff --git a/lib/arx/entities/paper.rb b/lib/arx/entities/paper.rb new file mode 100644 index 0000000..a31a5f8 --- /dev/null +++ b/lib/arx/entities/paper.rb @@ -0,0 +1,159 @@ +require 'happymapper' +require 'arx/exceptions' +require 'arx/cleaner' +require_relative 'author' +require_relative 'category' +require_relative 'link' + +module Arx + + # Entity/model representing an arXiv paper. + class Paper + include HappyMapper + + tag 'entry' + + element :id, Cleaner, parser: :clean, tag: 'id' + # The identifier of the paper. + # @note This is either in {Validate::OLD_IDENTIFIER_FORMAT} or {Validate::NEW_IDENTIFIER_FORMAT}. 
+ # @example + # 1705.01662v1 + # cond-mat/0211034 + # @return [String] The paper's identifier. + def id + @id.sub /https?\:\/\/arxiv\.org\/abs\//, '' + end + + # The URL of the paper on the arXiv website. + # @example + # http://arxiv.org/abs/1705.01662v1 + # http://arxiv.org/abs/cond-mat/0211034 + # @return [String] The paper's arXiv URL. + def url + @id + end + + # @!method last_updated + # The date that the paper was last updated. + # @return [DateTime] + element :last_updated, DateTime, tag: 'updated' + + # @!method publish_date + # The original publish/submission date of the paper. + # @return [DateTime] + element :publish_date, DateTime, tag: 'published' + + # @!method title + # The title of the paper. + # @return [String] + element :title, Cleaner, parser: :clean, tag: 'title' + + # @!method authors + # The authors of the paper. + # @return [Array] + has_many :authors, Author, tag: 'author' + + # @!method primary_category + # The primary category of the paper. + # @return [Category] + element :primary_category, Category, tag: 'primary_category' + + # @!method categories + # The categories of the paper. + # @return [Array] + has_many :categories, Category, tag: 'category' + + # Whether the paper is a revision or not. + # @note A paper is a revision if {last_updated} differs from {publish_date}. + # @return [Boolean] + def revision? + @publish_date != @last_updated + end + + # @!method summary + # The summary (or abstract) of the paper. + # @return [String] + element :summary, Cleaner, parser: :clean, tag: 'summary' + alias_method :abstract, :summary + + # @!method comment? + # Whether or not the paper has a comment. + # @return [Boolean] + + # @!method comment + # The comment of the paper. + # @note This is an optional metadata field on an arXiv paper. To check whether the paper has a comment, use {comment?} + # @raise {MissingFieldError} If the paper does not have a comment. 
+ # @return [String] + element :comment, Cleaner, parser: :clean, tag: 'comment' + + # @!method journal? + # Whether or not the paper has a journal reference. + # @return [Boolean] + + # @!method journal + # The journal reference of the paper. + # @note This is an optional metadata field on an arXiv paper. To check whether the paper has a journal reference, use {journal?} + # @raise {MissingFieldError} If the paper does not have a journal reference. + # @return [String] + element :journal, Cleaner, parser: :clean, tag: 'journal_ref' + + %i[comment journal].each do |optional| + exists = "#{optional}?" + + define_method exists do + !instance_variable_get("@#{optional}").empty? + end + + define_method optional do + if self.send "#{optional}?" + instance_variable_get("@#{optional}") + else + raise MissingFieldError.new(optional) + end + end + end + + has_many :links, Link, tag: 'link' + + # @!method pdf? + # Whether or not the paper has a PDF link. + # @return [Boolean] + + # @!method pdf_url + # Link to the PDF version of the paper. + # @note This is an optional metadata field on an arXiv paper. To check whether the paper has a PDF link, use {pdf?} + # @raise {MissingLinkError} If the paper does not have a PDF link. + # @return [String] + + # @!method doi? + # Whether or not the paper has a DOI (Digital Object Identifier) link. + # @see https://arxiv.org/help/jref#doi + # @see https://arxiv.org/help/prep#doi + # @return [Boolean] + + # @!method doi_url + # Link to the DOI (Digital Object Identifier) of the paper. + # @see https://arxiv.org/help/jref#doi + # @see https://arxiv.org/help/prep#doi + # @note This is an optional metadata field on an arXiv paper. To check whether the paper has a DOI link, use {doi?} + # @raise {MissingLinkError} If the paper does not have a DOI link. + # @return [String] + + %i[pdf doi].each do |link_type| + exists = "#{link_type}?".to_sym + + define_method exists do + links.any? 
&exists + end + + define_method "#{link_type}_url" do + if self.send exists + links.find(&exists).href + else + raise MissingLinkError.new link_type.to_s.upcase + end + end + end + end +end \ No newline at end of file diff --git a/lib/arx/exceptions.rb b/lib/arx/exceptions.rb new file mode 100644 index 0000000..21663ba --- /dev/null +++ b/lib/arx/exceptions.rb @@ -0,0 +1,23 @@ +module Arx + + # Custom error for missing links on an arXiv paper. + class MissingLinkError < StandardError + def initialize(link_type) + super "This arXiv paper does not have a #{link_type} link" + end + end + + # Custom error for missing fields on an arXiv paper. + class MissingFieldError < StandardError + def initialize(field) + super "This arXiv paper is missing the `#{field}` field" + end + end + + # Custom error for missing arXiv papers. + class MissingPaper < StandardError + def initialize(id) + super "Couldn't find an arXiv paper with ID: #{id}" + end + end +end \ No newline at end of file diff --git a/lib/arx/query/query.rb b/lib/arx/query/query.rb new file mode 100644 index 0000000..21b13c4 --- /dev/null +++ b/lib/arx/query/query.rb @@ -0,0 +1,266 @@ +# frozen_string_literal: true + +require 'cgi' +require_relative 'validate' + +module Arx + + # Class for generating arXiv search API query strings. + # + # @attr query [String] The string representing the search query. + class Query + + # Mapping for URL query parameters supported by the arXiv search API. + PARAMS = { + search_query: 'search_query', + id_list: 'id_list', + sort_by: 'sortBy', + sort_order: 'sortOrder' + } + + # Logical connectives supported by the arXiv search API. + CONNECTIVES = { + and: 'AND', + or: 'OR', + and_not: 'ANDNOT' + } + + # Logical connective method names. + CONNECTIVE_METHODS = { + '&': :and, + '!': :and_not, + '|': :or + } + + # Supported fields for the search queries made to the arXiv search API. 
+ # @see https://arxiv.org/help/prep arXiv metadata fields + # @see https://arxiv.org/help/api/user-manual#query_details arXiv user manual (query details) + FIELDS = { + title: 'ti', # Title + author: 'au', # Author + abstract: 'abs', # Abstract + comment: 'co', # Comment + journal: 'jr', # Journal reference + category: 'cat', # Subject category + report: 'rn', # Report number + all: 'all' # All (of the above) + } + + # Supported criteria for the +sortBy+ parameter. + SORT_BY = { + relevance: 'relevance', + last_updated: 'lastUpdated', + date_submitted: 'submittedDate' + } + + # Supported criteria for the +sortOrder+ parameter. + SORT_ORDER = { + ascending: 'ascending', + descending: 'descending' + } + + # Initializes a new Query object. + # + # @param ids [Array] The IDs of the arXiv papers to restrict the query to. + # @param sort_by [Symbol] The sorting criteria for the returned results (see {SORT_BY}). + # @param sort_order [Symbol] The sorting order for the returned results (see {SORT_ORDER}). + # @return [Query] The initialized query object. + def initialize(*ids, sort_by: :relevance, sort_order: :descending) + @query = String.new + + Validate.sort_by sort_by, permitted: SORT_BY.keys + @query << "&#{PARAMS[:sort_by]}=#{SORT_BY[sort_by]}" + + Validate.sort_order sort_order, permitted: SORT_ORDER.keys + @query << "&#{PARAMS[:sort_order]}=#{SORT_ORDER[sort_order]}" + + ids.flatten! + unless ids.empty? + ids.map! {|id| extract_id id} + Validate.ids ids + @query << "&#{PARAMS[:id_list]}=#{ids * ','}" + end + + yield self if block_given? + end + + # @!method & + # Logical conjunction (+AND+) of subqueries. + # @see https://arxiv.org/help/api/user-manual#query_details arXiv user manual + # @return [self] + + # @!method ! + # Logical negated conjunction (+ANDNOT+) of subqueries. + # @see https://arxiv.org/help/api/user-manual#query_details arXiv user manual + # @return [self] + + # @!method | + # Logical disjunction (+OR+) of subqueries. 
+ # @see https://arxiv.org/help/api/user-manual#query_details arXiv user manual + # @return [self] + + CONNECTIVE_METHODS.each do |symbol, connective| + define_method(symbol) { add_connective connective } + end + + # @!method title(*values, exact: false, connective: :and) + # Search for papers by {https://arxiv.org/help/prep#title title}. + # @param values [Array] Title(s) of papers to search for. + # @param exact [Boolean] Whether to search for an exact match of the title(s). + # @param connective [Symbol] The logical connective to use (see {CONNECTIVES}). Only applies if there are multiple values. + # @return [self] + + # @!method author(*values, exact: false, connective: :and) + # Search for papers by {https://arxiv.org/help/prep#author author}. + # @param values [Array] Author(s) of papers to search for. + # @param exact [Boolean] Whether to search for an exact match of the author's name(s). + # @param connective [Symbol] The logical connective to use (see {CONNECTIVES}). Only applies if there are multiple values. + # @return [self] + + # @!method abstract(*values, exact: false, connective: :and) + # Search for papers by {https://arxiv.org/help/prep#abstract abstract}. + # @param values [Array] Abstract(s) of papers to search for. + # @param exact [Boolean] Whether to search for an exact match of the abstract(s). + # @param connective [Symbol] The logical connective to use (see {CONNECTIVES}). Only applies if there are multiple values. + # @return [self] + + # @!method comment(*values, exact: false, connective: :and) + # Search for papers by {https://arxiv.org/help/prep#comments comment}. + # @param values [Array] Comment(s) of papers to search for. + # @param exact [Boolean] Whether to search for an exact match of the comment(s). + # @param connective [Symbol] The logical connective to use (see {CONNECTIVES}). Only applies if there are multiple values. 
+ # @return [self] + + # @!method journal(*values, exact: false, connective: :and) + # Search for papers by {https://arxiv.org/help/prep#journal journal reference}. + # @param values [Array] Journal reference(s) of papers to search for. + # @param exact [Boolean] Whether to search for an exact match of the journal refernece(s). + # @param connective [Symbol] The logical connective to use (see {CONNECTIVES}). Only applies if there are multiple values. + # @return [self] + + # @!method category(*values, connective: :and) + # Search for papers by {https://arxiv.org/help/prep#category category}. + # @param values [Array] Category(s) of papers to search for. + # @param connective [Symbol] The logical connective to use (see {CONNECTIVES}). Only applies if there are multiple values. + # @return [self] + + # @!method report(*values, connective: :and) + # Search for papers by {https://arxiv.org/help/prep#report report number}. + # @param values [Array] Report number(s) of papers to search for. + # @param connective [Symbol] The logical connective to use (see {CONNECTIVES}). Only applies if there are multiple values. + # @return [self] + + # @!method all(*values, exact: true, connective: :and) + # Search for papers by all fields (see {FIELDS}). + # @param values [Array] Field value(s) of papers to search for. + # @param exact [Boolean] Whether to search for an exact match of the comment(s). + # @param connective [Symbol] The logical connective to use (see {CONNECTIVES}). Only applies if there are multiple values. + # @return [self] + + FIELDS.each do |name, field| + define_method(name) do |*values, exact: true, connective: :and| + return if values.empty? + + Validate.values values + Validate.categories values if name == :category + Validate.exact exact + Validate.connective connective, permitted: CONNECTIVES.keys + + values.map! &CGI.method(:escape) + + # Forms a field:value pair + pair = ->(value){"#{field}:#{exact ? 
enquote(value) : value}"} + + subquery = if values.size > 1 + parenthesize values.map(&pair).join("+#{CONNECTIVES[connective]}+") + else + pair.(values.first) + end + + add_subquery subquery + self + end + end + + # Returns the query string. + # + # @return [String] + def to_s + @query + end + + private + + # Appends a logical connective to the end of the query string. + # + # @see CONNECTIVES + # @param connective [Symbol] The symbol of the logical connective to add. + # @return [self] + def add_connective(connective) + return unless search_query? + @query << "+#{CONNECTIVES[connective]}" unless ends_with_connective? + self + end + + # Appends a subquery to the end of the query string. + # + # @param subquery [String] The subquery to add. + def add_subquery(subquery) + if search_query? + if ends_with_connective? + @query << "+#{subquery}" + else + add_connective :and + @query << "+#{subquery}" + end + else + @query << "&#{PARAMS[:search_query]}=#{subquery}" + end + end + + # Whether the query string contains the +search_query+ parameter. + # + # @see PARAMS + # @return [Boolean] + def search_query? + @query.include? PARAMS[:search_query] + end + + # Whether the query string ends in a logical connective. + # + # @see CONNECTIVES + # @return [Boolean] + def ends_with_connective? + CONNECTIVES.values.any? &@query.method(:end_with?) + end + + # Parenthesizes a string with CGI-escaped parentheses. + # + # @param string [String] The string to parenthesize. + # @return [String] The parenthesized string. + def parenthesize(string) + CGI.escape('(') + string + CGI.escape(')') + end + + # Enquotes a string with CGI-escaped double quotes. + # + # @param string [String] The string to enquote. + # @return [String] The enquoted string. + def enquote(string) + CGI.escape("\"") + string + CGI.escape("\"") + end + + # Attempt to extract an ID from an arXiv URL. + # + # @param url [String] The URL to extract the ID from. 
+ # @return [String] The extracted ID if successful, otherwise the original string. + def extract_id(url) + prefix = %r"^(https?\:\/\/)?(www.)?arxiv\.org\/abs\/" + if %r"#{prefix}.*$".match? url + url.sub(prefix, '').sub(%r"\/$", '') + else + url + end + end + end +end \ No newline at end of file diff --git a/lib/arx/query/validate.rb b/lib/arx/query/validate.rb new file mode 100644 index 0000000..0a602c7 --- /dev/null +++ b/lib/arx/query/validate.rb @@ -0,0 +1,123 @@ +require_relative '../categories' + +module Arx + + # Validations for arXiv search query fields and identifier schemes. + module Validate + + # The current arxiv paper identifier scheme (1 April 2007 and onwards). + # The last block of digits can either be five digits (if the paper was published after 1501 - January 2015), + # or four digits (if the paper was published before 1501). + # + # @see https://arxiv.org/help/arxiv_identifier#new arXiv identifier (new) + # @example + # 1501.00001 + # 1705.01662v1 + # 1412.0135 + # 0706.0001v2 + NEW_IDENTIFIER_FORMAT = %r"^\d{4}\.\d{4,5}(v\d+)?$" + + # The legacy arXiv paper identifier scheme (before 1 April 2007). + # + # @see https://arxiv.org/help/arxiv_identifier#old arXiv identifier (old) + # @example + # math/0309136v1 + # cond-mat/0211034 + OLD_IDENTIFIER_FORMAT = %r"^[a-z]+(\-[a-z]+)?\/\d{7}(v\d+)?$" + + class << self + # Validates the +sortBy+ field of the query string. + # + # @param value [Symbol] The value to validate. + # @param permitted [Array] Permitted values for the field. + # @raise + # [TypeError] If the value is not a +Symbol+. + # [ArgumentError] If the value is not permitted. + def sort_by(value, permitted:) + raise TypeError.new("Expected `sort_by` to be a Symbol, got: #{value.class}") unless value.is_a? Symbol + raise ArgumentError.new("Expected `sort_by` to be one of #{permitted}, got: #{value}") unless permitted.include? value + end + + # Validates the +sortOrder+ field of the query string. 
+ # + # @param value [Symbol] The value to validate. + # @param permitted [Array] Permitted values for the field. + # @raise + # [TypeError] If the value is not a +Symbol+. + # [ArgumentError] If the value is not permitted. + def sort_order(value, permitted:) + raise TypeError.new("Expected `sort_order` to be a Symbol, got: #{value.class}") unless value.is_a? Symbol + raise ArgumentError.new("Expected `sort_order` to be one of #{permitted}, got: #{value}") unless permitted.include? value + end + + # Validates a list of arXiv paper identifiers. + # + # @param ids [Array] The identifiers to validate. + # @raise + # [TypeError] If +ids+ is not an +Array+. + # [TypeError] If any identifier is not a +String+. + # [ArgumentError] If the identifier is invalid. + def ids(ids) + raise TypeError.new("Expected `ids` to be an Array, got: #{ids.class}") unless ids.is_a? Array + ids.each do |id| + raise TypeError.new("Expected identifier to be a String, got: #{id.class} (#{id})") unless id.is_a? String + raise ArgumentError.new("Malformed arXiv identifier: #{id}") unless id? id + end + end + + # Validates the +exact+ parameter. + # + # @param value [Boolean] The value to validate. + # @raise + # [TypeError] If the value is not a boolean (+TrueClass+ or +FalseClass+). + def exact(value) + raise TypeError.new("Expected `exact` to be boolean (TrueClass or FalseClass), got: #{value.class}") unless value == !!value + end + + # Validates a logical connective. + # + # @param value [Symbol] The value to validate. + # @param permitted [Array] Permitted values for the field. + # @raise + # [TypeError] If the value is not a +Symbol+. + # [ArgumentError] If the value is not permitted. + def connective(value, permitted:) + raise TypeError.new("Expected `connective` to be a Symbol, got: #{value.class}") unless value.is_a? Symbol + raise ArgumentError.new("Expected `connective` to be one of #{permitted}, got: #{value}") unless permitted.include? 
value + end + + # Validates a list of values for the fields of the search query string. + # + # @param values [Array] The values to validate. + # @raise + # [TypeError] If +values+ is not an +Array+. + # [TypeError] If any value is not a +String+. + def values(values) + raise TypeError.new("Expected `values` to be an Array, got: #{values.class}") unless values.is_a? Array + values.each do |value| + raise TypeError.new("Expected value to be a String, got: #{value.class} (#{value})") unless value.is_a? String + end + end + + # Validates a list of arXiv categories. + # + # @note This is only called after {values}, so there is no need to check types. + # @param categories [Array] The categories to validate. + # @raise [ArgumentError] If any category is unrecognized (not a valid arXiv category). + # @see Arx::CATEGORIES + def categories(categories) + categories.each do |category| + raise ArgumentError.new("Unrecognized arXiv category (#{category}). See Arx::CATEGORIES.") unless Arx::CATEGORIES.keys.include? category + end + end + + # Validates an arXiv identifier of both the old and new schemes. + # + # @see NEW_IDENTIFIER_FORMAT + # @see OLD_IDENTIFIER_FORMAT + def id?(id) + NEW_IDENTIFIER_FORMAT.match?(id) || OLD_IDENTIFIER_FORMAT.match?(id) + end + end + end +end \ No newline at end of file diff --git a/lib/arx/version.rb b/lib/arx/version.rb new file mode 100644 index 0000000..e6b8843 --- /dev/null +++ b/lib/arx/version.rb @@ -0,0 +1,12 @@ +# frozen_string_literal: true + +module Arx + + # The current version of Arx. 
+ VERSION = { + major: 0, + minor: 1, + patch: 0, + meta: nil + }.compact.values.join('.').freeze +end diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb new file mode 100644 index 0000000..077279a --- /dev/null +++ b/spec/spec_helper.rb @@ -0,0 +1,12 @@ +$LOAD_PATH.unshift File.join __dir__, '..', 'lib' +require 'bundler/setup' +require 'arx' + +# Load support files from spec/support +Dir.glob File.join(__dir__, 'support', '*.rb'), &method(:require) + +RSpec.configure do |config| + # Configure RSpec here... +end + +include Arx \ No newline at end of file diff --git a/spec/support/.gitkeep b/spec/support/.gitkeep new file mode 100644 index 0000000..e69de29