2020-03-09 12:13:27 +01:00
|
|
|
# frozen_string_literal: true
|
|
|
|
|
2015-04-27 21:26:37 +02:00
|
|
|
require 'elasticsearch/model'
|
2015-04-27 21:03:58 +02:00
|
|
|
require 'json'
|
2015-09-27 13:51:13 +02:00
|
|
|
require 'parseable_date_validator'
|
2015-04-13 22:09:28 +02:00
|
|
|
|
|
|
|
# ActiveRecord model for a paper. Records are validated here and mirrored into
# an Elasticsearch index via Elasticsearch::Model (the Callbacks module is
# included so that persistence changes are propagated to the index).
class Paper < ActiveRecord::Base
  include Elasticsearch::Model
  include Elasticsearch::Model::Callbacks

  validates :name, presence: true, length: { maximum: 1000 }
  validates :url, presence: true,
                  length: { maximum: 1000 },
                  uniqueness: true, # TODO: use unique index instead
                  url: true
  validates :reference, presence: true, length: { maximum: 100 }
  validates :body, presence: true, length: { maximum: 100 }
  validates :content, presence: true, length: { maximum: 100_000 }
  validates :originator, presence: true, length: { maximum: 300 }
  validates :paper_type, presence: true, length: { maximum: 50 }
  validates :published_at, presence: true, parseable_date: true
  validates :resolution, length: { maximum: 30_000 }

  # The index name is built from a fixed prefix, the Rails environment and the
  # pluralized, underscored base class name.
  index_name ['srm', Rails.env, base_class.to_s.pluralize.underscore].join('_')

  # Index settings: a single shard and a custom "german" analyzer assembled
  # from the token filters declared below.
  settings index: {
    number_of_shards: 1,
    analysis: {
      filter: {
        german_stop: {
          type: 'stop',
          stopwords: '_german_'
        },
        german_stemmer: {
          type: 'stemmer',
          language: 'light_german'
        },
        german_decompounder: {
          type: 'hyphenation_decompounder',
          word_list_path: 'analysis/dictionary-de.txt',
          hyphenation_patterns_path: 'analysis/de_DR.xml',
          only_longest_match: true,
          min_subword_size: 4
        }
      },
      analyzer: {
        german: {
          tokenizer: 'standard',
          # Filters are applied in the order listed.
          filter: %w[
            lowercase
            german_stop
            german_decompounder
            german_normalization
            german_stemmer
          ]
        }
      }
    }
  } do
    # Only the fields listed here are mapped (dynamic mapping is disabled).
    mappings dynamic: false do
      indexes :name, type: :text, analyzer: 'german'
      indexes :content, type: :text, analyzer: 'german'
      indexes :resolution, type: :text, analyzer: 'german'
      indexes :reference, type: :keyword, index: true
      indexes :paper_type, type: :keyword, index: true
      indexes :published_at, type: :date, index: true
      indexes :originator, type: :keyword, index: true
    end
  end

  # Splits an enumerated originator string (entries introduced by markers such
  # as "1. ", "2. ") into its individual entries. Entries are not stripped.
  #
  # @return [Array<String>, String, nil] the non-blank entries, or the raw
  #   originator value when splitting leaves nothing
  def split_originator
    # \d+ consumes multi-digit markers ("10. ") whole instead of leaving a
    # stray leading digit attached to the previous entry.
    entries = originator.to_s.split(/\d+\.\s/).reject(&:blank?)
    # An empty Array is truthy, so `presence` is required for the fallback to
    # the unsplit originator to ever take effect.
    entries.presence || originator
  end

  # Builds the document that is sent to Elasticsearch: the record's JSON
  # representation with the originator replaced by its split form.
  #
  # NOTE(review): the merged key is a Symbol. If `as_json` returns String keys
  # the hash will hold both "originator" and :originator -- confirm that the
  # serializer in use resolves this as intended.
  #
  # @param _options [Hash] accepted for interface compatibility; unused
  # @return [Hash]
  def as_indexed_json(_options = {})
    as_json.merge(originator: split_originator)
  end

  class << self
    # Creates or updates papers from a JSON array of record hashes. Records
    # are matched by their url. Records that fail to save are reported on
    # stdout and skipped; a summary of newly created rows is printed at the end.
    #
    # @param json_string [String] JSON document containing an array of objects
    # @return [nil]
    # @raise [JSON::ParserError] if json_string is not valid JSON
    def import_from_json(json_string)
      old_count = count
      JSON.parse(json_string).each do |record|
        attributes = {
          body: record['body'],
          # Truncated to stay below the content length validation. The safe
          # navigation keeps a missing content from aborting the whole import;
          # such a record is instead rejected by validation and reported below.
          content: record['content']&.truncate(99_998),
          name: record['name'],
          resolution: record['resolution'],
          originator: record['originator'],
          paper_type: record['paper_type'],
          published_at: record['published_at'],
          reference: record['reference'],
          url: record['url']
        }
        paper = find_or_initialize_by(url: attributes[:url])
        next if paper.update(attributes)

        puts "Not imported #{attributes[:name]}: #{paper.errors.messages}"
      end
      puts "Imported #{count - old_count} Papers!"
    end

    # use DSL to define search queries
    # see https://github.com/elastic/elasticsearch-ruby/tree/master/elasticsearch-dsl
    # and https://github.com/elastic/elasticsearch-rails/tree/master/elasticsearch-rails/lib/rails/templates
    #
    # Logs the query at debug level and forwards it to Elasticsearch.
    #
    # @param search_definition [Object] query definition passed through to
    #   `__elasticsearch__.search`
    def search(search_definition)
      # Block form: the JSON is only generated when debug logging is enabled.
      Rails.logger.debug { "Query: #{search_definition.to_json}" }
      __elasticsearch__.search(search_definition)
    end

    # Recreates the index (dropping an existing one) and indexes every paper.
    def reset_index!
      __elasticsearch__.create_index! force: true
      # find_each loads records in batches instead of materializing all rows.
      find_each { |paper| paper.__elasticsearch__.index_document }
    end
  end
end
|