stadtratmonitor/app/models/paper.rb
2020-03-11 20:48:10 +01:00

129 lines
4.1 KiB
Ruby

# frozen_string_literal: true
require 'elasticsearch/model'
require 'json'
require 'parseable_date_validator'
# A council paper that is persisted via ActiveRecord and mirrored into an
# Elasticsearch index for full-text search.
#
# Including Elasticsearch::Model::Callbacks keeps the index in sync with
# create/update/destroy of records.
class Paper < ActiveRecord::Base
  include Elasticsearch::Model
  include Elasticsearch::Model::Callbacks

  validates :name, presence: true, length: { maximum: 1000 }
  validates :url, presence: true,
                  length: { maximum: 1000 },
                  uniqueness: true, # TODO: use unique index instead
                  url: true
  validates :reference, presence: true, length: { maximum: 100 }
  validates :body, presence: true, length: { maximum: 100 }
  validates :content, presence: true, length: { maximum: 100_000 }
  validates :originator, presence: true, length: { maximum: 300 }
  validates :paper_type, presence: true, length: { maximum: 50 }
  validates :published_at, presence: true, parseable_date: true
  validates :resolution, length: { maximum: 30_000 }

  # Index name is scoped by environment, e.g. "srm_development_papers".
  index_name ['srm', Rails.env, base_class.to_s.pluralize.underscore].join('_')

  # Custom "german" analyzer: lowercasing, stop-word removal, compound-word
  # splitting, normalization and light stemming.
  # NOTE(review): the word list and hyphenation pattern files referenced below
  # are not part of this file; they presumably have to be installed on the
  # Elasticsearch node - confirm against the deployment setup.
  settings index: {
    number_of_shards: 1,
    analysis: {
      filter: {
        german_stop: {
          type: 'stop',
          stopwords: '_german_'
        },
        german_stemmer: {
          type: 'stemmer',
          language: 'light_german'
        },
        german_decompounder: {
          type: 'hyphenation_decompounder',
          word_list_path: 'analysis/dictionary-de.txt',
          hyphenation_patterns_path: 'analysis/de_DR.xml',
          only_longest_match: true,
          min_subword_size: 4
        }
      },
      analyzer: {
        german: {
          tokenizer: 'standard',
          # german_normalization is not declared in the filter section above;
          # presumably it is Elasticsearch's built-in filter of that name.
          filter: %w[
            lowercase
            german_stop
            german_decompounder
            german_normalization
            german_stemmer
          ]
        }
      }
    }
  } do
    # dynamic: false - only the fields listed here are mapped.
    mappings dynamic: false do
      # Free-text fields run through the custom german analyzer.
      indexes :name, type: :text, analyzer: 'german'
      indexes :content, type: :text, analyzer: 'german'
      indexes :resolution, type: :text, analyzer: 'german'
      # Exact-match fields.
      indexes :reference, type: :keyword, index: true
      indexes :paper_type, type: :keyword, index: true
      indexes :published_at, type: :date, index: true
      indexes :originator, type: :keyword, index: true
    end
  end

  # Splits an enumerated originator string ("1. Foo 2. Bar") into its parts.
  #
  # The split happens on "<digit>. " markers; blank fragments (such as the
  # empty string before the first marker) are dropped. Fragments are not
  # stripped, so trailing whitespace is preserved.
  #
  # @return [Array<String>] the originator parts; empty when originator is
  #   nil or blank (e.g. for a record that has not passed validation)
  def split_originator
    originator.to_s.split(/\d\.\s/).reject(&:blank?)
  end

  # Document representation used for indexing: the record's as_json hash
  # with the originator replaced by its split parts.
  #
  # NOTE(review): as_json presumably yields string keys, so the symbol key
  # merged here may coexist with a "originator" string key until the hash is
  # serialized - confirm before relying on hash lookups by key.
  #
  # @param _options [Hash] ignored
  # @return [Hash]
  def as_indexed_json(_options = {})
    as_json.merge(originator: split_originator)
  end

  class << self
    # Creates or updates papers from a JSON array of paper hashes.
    #
    # Papers are matched by URL. Entries that fail validation are skipped
    # silently (update returns false). Prints the number of newly created
    # papers to stdout.
    #
    # @param json_string [String] JSON array of objects with the keys
    #   body, content, name, resolution, originator, paper_type,
    #   published_at, reference and url
    # @raise [JSON::ParserError] if json_string is not valid JSON
    def import_from_json(json_string)
      old_count = count
      JSON.parse(json_string).each do |entry|
        attributes = {
          body: entry['body'],
          content: entry['content'],
          name: entry['name'],
          resolution: entry['resolution'],
          originator: entry['originator'],
          paper_type: entry['paper_type'],
          published_at: entry['published_at'],
          reference: entry['reference'],
          url: entry['url']
        }
        paper = find_or_initialize_by(url: attributes[:url])
        # `update` replaces the deprecated `update_attributes` alias.
        paper.update(attributes)
      end
      puts "Imported #{count - old_count} Papers!"
    end

    # Creates or updates a single paper from an OParl paper document.
    #
    # The paper is matched by its "web" URL. The OParl document carries no
    # content here, so content is stored as the placeholder 'n.a.'.
    #
    # @param oparl_doc [String] JSON document of one OParl paper
    # @return [Paper, nil] the saved record, or nil if saving failed
    # @raise [JSON::ParserError] if oparl_doc is not valid JSON
    def import_from_oparl(oparl_doc)
      doc = JSON.parse(oparl_doc)
      attributes = {
        name: doc['name'],
        body: doc['body'],
        paper_type: doc['paperType'],
        reference: doc['reference'],
        url: doc['web'],
        published_at: doc['modified'],
        content: 'n.a.',
        originator: doc['leipzig:originator']
      }
      record = find_or_initialize_by(url: attributes[:url])
      record.update(attributes) ? record : nil
    end

    # Runs a search against the papers index, logging the query at debug level.
    #
    # use DSL to define search queries
    # see https://github.com/elastic/elasticsearch-ruby/tree/master/elasticsearch-dsl
    # and https://github.com/elastic/elasticsearch-rails/tree/master/elasticsearch-rails/lib/rails/templates
    #
    # @param search_definition [#to_json] query definition passed through to
    #   Elasticsearch::Model
    # @return the response object of __elasticsearch__.search
    def search(search_definition)
      Rails.logger.debug "Query: #{search_definition.to_json}"
      __elasticsearch__.search(search_definition)
    end

    # Drops and recreates the index, then indexes every paper again.
    #
    # Records are loaded in batches (find_each) so the whole table is never
    # held in memory at once.
    def reset_index!
      __elasticsearch__.create_index! force: true
      find_each { |paper| paper.__elasticsearch__.index_document }
    end
  end
end