indexer.rb
require 'rubygems'
require 'indextank'
require 'nokogiri'
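
# Configuration sketch (hedged, inferred from the keys read in `initialize` and
# `generate` below; the values are placeholders, not real credentials). In _config.yml:
#
#   indextank_api_url: <your private IndexTank API url>
#   indextank_index:   <your index name>
#   indextank_excludes:            # optional list of url regexes to skip
#     - ^/drafts/
#
# Individual pages can also opt out by setting `exclude_from_search: true`
# in their front matter.
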
module Jekyll

  class Indexer < Generator
    def initialize(config = {})
      super(config)

      raise ArgumentError.new 'Missing indextank_api_url.' unless config['indextank_api_url']
      raise ArgumentError.new 'Missing indextank_index.' unless config['indextank_index']

      @storage_dir       = File.join(self.home_dir, '.jekyll_indextank')
      @last_indexed_file = File.join(@storage_dir, 'last_index')

      create_storage_dir
      load_last_timestamp

      @excludes = config['indextank_excludes'] || []

      api    = IndexTank::Client.new(config['indextank_api_url'])
      @index = api.indexes(config['indextank_index'])
    end

    # Index all pages except those matching any pattern in config['indextank_excludes'].
    # The main content of each page is extracted and indexed at indextank.com.
    # The doc_id of each IndexTank document is the page's absolute URL, without the domain name.
    def generate(site)
      puts 'Indexing pages...'

      # gather pages and posts
      items = site.pages.dup.concat(site.posts)

      # only process files that will be converted to .html and are not excluded
      items = items.find_all {|i| i.output_ext == '.html' && !@excludes.any? {|s| (i.absolute_url =~ Regexp.new(s)) != nil } }
      items.reject! {|i| i.data['exclude_from_search'] }

      # only process items that have changed since the last regeneration
      items = items.find_all {|i| @last_indexed.nil? || File.mtime(i.full_path_to_source) > @last_indexed }

      # don't process index pages
      items.reject! {|i| i.is_a?(Jekyll::Page) && i.index? }

      # wait for the IndexTank index to be ready
      until @index.running?
        sleep 0.5
      end

      items.each do |item|
        page_text = extract_text(site, item)

        @index.document(item.absolute_url).add({
          :text  => page_text,
          :title => item.data['title'] || item.name
        })
        puts 'Indexed ' << item.absolute_url
      end

      @last_indexed = Time.now
      write_last_indexed

      puts 'Indexing done'
    end

    # render the item, parse the output, and return the text of all <p> elements
    def extract_text(site, page)
      page.render({}, site.site_payload)
      doc = Nokogiri::HTML(page.output)
      paragraphs = doc.search('p').map {|e| e.text }
      paragraphs.join(' ').gsub("\r", ' ').gsub("\n", ' ')
    end

    # persist the timestamp of the last successful indexing run
    def write_last_indexed
      File.open(@last_indexed_file, 'w') {|f| Marshal.dump(@last_indexed, f) }
    rescue
      puts 'WARNING: cannot write indexed timestamps file.'
    end

    # load the timestamp of the previous indexing run, if one was recorded
    def load_last_timestamp
      @last_indexed = File.open(@last_indexed_file, 'rb') {|f| Marshal.load(f) }
    rescue
      @last_indexed = nil
    end

    # create the directory that stores the last-indexed timestamp
    def create_storage_dir
      Dir.mkdir(@storage_dir) unless File.exist?(@storage_dir)
    rescue SystemCallError
      puts 'WARNING: cannot create directory to store index timestamps.'
    end

    # return the current user's home directory (HOME on Unix, HOMEPATH on Windows)
    def home_dir
      homes = ['HOME', 'HOMEPATH']
      ENV[homes.detect {|h| ENV[h] != nil }]
    end
  end
end
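
# A minimal usage sketch (an assumption, not part of the plugin): Jekyll normally
# instantiates generators itself from the site config, but wiring it up by hand
# looks roughly like this, where `site` is the Jekyll::Site being built:
#
#   indexer = Jekyll::Indexer.new(
#     'indextank_api_url' => '<your IndexTank API url>',
#     'indextank_index'   => 'my_site'
#   )
#   indexer.generate(site)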