documentcloud · ntodd · Dec 18, 2014 · Dec 18, 2014
diff --git a/index.html b/index.html
@@ -159,6 +159,11 @@ <h2 id="installation">Installation &amp; Dependencies</h2>
         <tt>aptitude install libreoffice</tt><br />
         On the Mac, download and install <a href="http://www.libreoffice.org/download">the latest release</a>.
       </li>
+      <li>
+        (Optional) Install <a href="http://www.gnu.org/software/parallel/">GNU Parallel</a>:<br />
+        <tt>[aptitude | port | brew] install parallel</tt><br />
+        GNU Parallel speeds up OCR text extraction by processing a document's pages concurrently.
+      </li>
     </ol>
 
     <p><i>

diff --git a/lib/docsplit.rb b/lib/docsplit.rb
@@ -13,11 +13,11 @@ module Docsplit
   ESCAPED_ROOT  = ESCAPE[ROOT]
 
   METADATA_KEYS = [:author, :date, :creator, :keywords, :producer, :subject, :title, :length]
-
-  GM_FORMATS    = ["image/gif", "image/jpeg", "image/png", "image/x-ms-bmp", "image/svg+xml", "image/tiff", "image/x-portable-bitmap", "application/postscript", "image/x-portable-pixmap"]
 
-  DEPENDENCIES  = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :pdftailor => false, :tesseract => false, :osd => false}
+  GM_FORMATS    = ["image/gif", "image/jpeg", "image/png", "image/x-ms-bmp", "image/svg+xml", "image/tiff", "image/x-portable-bitmap", "application/postscript", "image/x-portable-pixmap"]
 
+  DEPENDENCIES  = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :pdftailor => false, :tesseract => false, :osd => false, :parallel => false}
+
   # Check for all dependencies, and note their absence.
   dirs = ENV['PATH'].split(File::PATH_SEPARATOR)
   DEPENDENCIES.each_key do |dep|
@@ -75,7 +75,7 @@ def self.extract_#{key}(pdfs, opts={})
       end
     EOS
   end
-  
+
   def self.extract_info(pdfs, opts={})
     pdfs = ensure_pdfs(pdfs)
     InfoExtractor.new.extract_all(pdfs, opts)

diff --git a/lib/docsplit/text_extractor.rb b/lib/docsplit/text_extractor.rb
@@ -72,18 +72,22 @@ def extract_from_ocr(pdf, pages)
           FileUtils.remove_entry_secure tiff
         end
       else
-        tiff = "#{tempdir}/#{@pdf_name}.tif"
-        escaped_tiff = ESCAPE[tiff]
-        run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
-        #if the user says don't do orientation detection or the plugin is not installed, set psm to 0
-        run "tesseract #{escaped_tiff} #{base_path} -l #{@language} #{psm} 2>&1"
-        clean_text(base_path + '.txt') if @clean_ocr
+        if DEPENDENCIES[:parallel]
+          run "MAGICK_TMPDIR=#{tempdir} gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{tempdir}/page_%d.tiff 2>&1"
+          run "parallel tesseract -l #{@language} #{psm} {} {.} ::: #{tempdir}/page_*.tiff 2>&1"
+          run "cat #{tempdir}/page_*.txt >'#{base_path}.txt' 2>&1"
+        else
+          tiff = "#{tempdir}/#{@pdf_name}.tif"
+          escaped_tiff = ESCAPE[tiff]
+          run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
+          run "tesseract #{escaped_tiff} #{base_path} -l #{@language} #{psm} 2>&1"
+        end
+        clean_text("#{base_path}.txt") if @clean_ocr
       end
     ensure
       FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
     end
 
-
     private
 
     def clean_text(file)