stadtratmonitor/app/models/paper.rb

130 lines
4.1 KiB
Ruby
Raw Normal View History

2020-03-09 12:13:27 +01:00
# frozen_string_literal: true
require 'elasticsearch/model'
require 'json'
2015-09-27 13:51:13 +02:00
require 'parseable_date_validator'
2015-04-13 22:09:28 +02:00
# A paper record persisted through ActiveRecord and mirrored into an
# Elasticsearch index (via Elasticsearch::Model and its callbacks) so that it
# can be searched with a German-language analyzer.
class Paper < ActiveRecord::Base
  include Elasticsearch::Model
  include Elasticsearch::Model::Callbacks

  # Enumeration marker ("1. ", "2. ", "10. ", ...) that separates the individual
  # entries inside the +originator+ string. `\d+` (rather than a single `\d`)
  # keeps multi-digit markers from leaving a stray leading digit behind.
  ORIGINATOR_ENUMERATION = /\d+\.\s/.freeze

  validates :name, presence: true, length: { maximum: 1000 }
  validates :url, presence: true,
                  length: { maximum: 1000 },
                  uniqueness: true, # TODO: use unique index instead
                  url: true
  validates :reference, presence: true, length: { maximum: 100 }
  validates :body, presence: true, length: { maximum: 100 }
  validates :content, presence: true, length: { maximum: 100_000 }
  validates :originator, presence: true, length: { maximum: 300 }
  validates :paper_type, presence: true, length: { maximum: 50 }
  validates :published_at, presence: true, parseable_date: true
  validates :resolution, length: { maximum: 30_000 }

  # One index per Rails environment, e.g. "srm_development_papers".
  index_name ['srm', Rails.env, base_class.to_s.pluralize.underscore].join('_')

  settings index: {
    number_of_shards: 1,
    analysis: {
      filter: {
        german_stop: {
          type: 'stop',
          stopwords: '_german_'
        },
        german_stemmer: {
          type: 'stemmer',
          language: 'light_german'
        },
        # Splits German compound words; both paths are resolved by
        # Elasticsearch, so the files must exist on the search node.
        german_decompounder: {
          type: 'hyphenation_decompounder',
          word_list_path: 'analysis/dictionary-de.txt',
          hyphenation_patterns_path: 'analysis/de_DR.xml',
          only_longest_match: true,
          min_subword_size: 4
        }
      },
      analyzer: {
        german: {
          tokenizer: 'standard',
          # `german_normalization` is not declared in the filter section above;
          # it refers to Elasticsearch's built-in token filter of that name.
          filter: %w[
            lowercase
            german_stop
            german_decompounder
            german_normalization
            german_stemmer
          ]
        }
      }
    }
  } do
    # dynamic: false => only the fields listed here are mapped in the index.
    mappings dynamic: false do
      indexes :name, type: :text, analyzer: 'german'
      indexes :content, type: :text, analyzer: 'german'
      indexes :resolution, type: :text, analyzer: 'german'
      indexes :reference, type: :keyword, index: true
      indexes :paper_type, type: :keyword, index: true
      indexes :published_at, type: :date, index: true
      indexes :originator, type: :keyword, index: true
    end
  end

  # Splits the enumerated +originator+ string ("1. Foo 2. Bar") into its
  # individual entries.
  #
  # Entries are stripped so that the same originator always yields the same
  # keyword value, and empty pieces (e.g. the one before the first marker) are
  # dropped. A string without any marker comes back as a one-element array; a
  # nil originator yields an empty array instead of raising.
  #
  # @return [Array<String>] the individual originators, possibly empty
  def split_originator
    originator.to_s.split(ORIGINATOR_ENUMERATION).map(&:strip).reject(&:blank?)
  end

  # Document body sent to Elasticsearch: the record's JSON representation with
  # +originator+ replaced by the list from #split_originator.
  #
  # NOTE(review): as_json presumably returns string keys, in which case the
  # merged hash carries both 'originator' (raw string) and :originator (array)
  # until it is serialized - confirm the serializer collapses them as intended.
  #
  # @param _options [Hash] ignored; accepted for interface compatibility
  # @return [Hash]
  def as_indexed_json(_options = {})
    as_json.merge(originator: split_originator)
  end

  class << self
    # Creates or updates one Paper per entry of a JSON array, matching existing
    # records by +url+, and prints how many new records were created.
    #
    # Entries that fail validation are not saved (+update+ returns false) and
    # are skipped without an error.
    #
    # @param json_string [String] JSON array of objects with the keys read below
    # @return [nil]
    # @raise [JSON::ParserError] if +json_string+ is not valid JSON
    def import_from_json(json_string)
      old_count = count
      JSON.parse(json_string).each do |entry|
        attributes = {
          body: entry['body'],
          content: entry['content'],
          name: entry['name'],
          resolution: entry['resolution'],
          originator: entry['originator'],
          paper_type: entry['paper_type'],
          published_at: entry['published_at'],
          reference: entry['reference'],
          url: entry['url']
        }
        paper = find_or_initialize_by(url: attributes[:url])
        # `update` replaces `update_attributes`, which was deprecated in
        # Rails 6.0 and removed in 6.1; import_from_oparl already uses it.
        paper.update(attributes)
      end
      puts "Imported #{count - old_count} Papers!"
    end

    # Creates or updates a single Paper from one OParl JSON document, matching
    # an existing record by the document's +web+ URL.
    #
    # NOTE(review): +content+ is always set to 'n.a.', which also overwrites the
    # content of an already existing record - confirm this is intended.
    #
    # @param oparl_doc [String] JSON object describing one paper
    # @return [Paper, nil] the saved record, or nil if it could not be saved
    # @raise [JSON::ParserError] if +oparl_doc+ is not valid JSON
    def import_from_oparl(oparl_doc)
      doc = JSON.parse(oparl_doc)
      attributes = {
        name: doc['name'],
        body: doc['body'],
        paper_type: doc['paperType'],
        reference: doc['reference'],
        url: doc['web'],
        published_at: doc['modified'],
        content: 'n.a.',
        originator: doc['leipzig:originator']
      }
      paper = find_or_initialize_by(url: attributes[:url])
      paper.update(attributes) ? paper : nil
    end

    # Runs a search against the Elasticsearch index and logs the query at
    # debug level.
    #
    # use DSL to define search queries
    # see https://github.com/elastic/elasticsearch-ruby/tree/master/elasticsearch-dsl
    # and https://github.com/elastic/elasticsearch-rails/tree/master/elasticsearch-rails/lib/rails/templates
    #
    # @param search_definition [#to_json] the query definition
    # @return the response object of the underlying Elasticsearch search
    def search(search_definition)
      Rails.logger.debug "Query: #{search_definition.to_json}"
      __elasticsearch__.search(search_definition)
    end

    # Recreates the index (force: true replaces an existing one) and indexes
    # every Paper again, one document at a time.
    #
    # Records are loaded in batches with +find_each+ so the whole table is
    # never held in memory at once.
    #
    # @return [nil]
    def reset_index!
      __elasticsearch__.create_index! force: true
      find_each { |paper| paper.__elasticsearch__.index_document }
    end
  end
end