# frozen_string_literal: true require 'elasticsearch/model' require 'json' require 'parseable_date_validator' class Paper < ActiveRecord::Base include Elasticsearch::Model include Elasticsearch::Model::Callbacks validates :name, presence: true, length: { maximum: 1000 } validates :url, presence: true, length: { maximum: 1000 }, uniqueness: true, # TODO: use unique index instead url: true validates :reference, presence: true, length: { maximum: 100 } validates :body, presence: true, length: { maximum: 100 } validates :content, presence: true, length: { maximum: 100_000 } validates :originator, presence: true, length: { maximum: 300 } validates :paper_type, presence: true, length: { maximum: 50 } validates :published_at, presence: true, parseable_date: true validates :resolution, length: { maximum: 30_000 } index_name ['srm', Rails.env, base_class.to_s.pluralize.underscore].join('_') settings index: { number_of_shards: 1, analysis: { filter: { german_stop: { type: 'stop', stopwords: '_german_' }, german_stemmer: { type: 'stemmer', language: 'light_german' }, german_decompounder: { type: 'hyphenation_decompounder', word_list_path: 'analysis/dictionary-de.txt', hyphenation_patterns_path: 'analysis/de_DR.xml', only_longest_match: true, min_subword_size: 4 } }, analyzer: { german: { tokenizer: 'standard', filter: %w[ lowercase german_stop german_decompounder german_normalization german_stemmer ] } } } } do mappings dynamic: false do indexes :name, type: :text, analyzer: 'german' indexes :content, type: :text, analyzer: 'german' indexes :resolution, type: :text, analyzer: 'german' indexes :reference, type: :keyword, index: true indexes :paper_type, type: :keyword, index: true indexes :published_at, type: :date, index: true indexes :originator, type: :keyword, index: true end end def split_originator originator.split(/\d\.\s/).reject(&:blank?) || originator end def as_indexed_json(_options = {}) as_json.merge(originator: split_originator) end class << self def import_from_json(json_string) old_count = count JSON.parse(json_string).each do |record| attributes = { body: record['body'], content: record['content'], name: record['name'], resolution: record['resolution'], originator: record['originator'], paper_type: record['paper_type'], published_at: record['published_at'], reference: record['reference'], url: record['url'] } record = find_or_initialize_by(url: attributes[:url]) record.update_attributes(attributes) end puts "Imported #{count - old_count} Papers!" end def import_from_oparl(oparl_doc) doc = JSON.parse(oparl_doc) attributes = { name: doc['name'], body: doc['body'], paper_type: doc['paperType'], reference: doc['reference'], url: doc['web'], published_at: doc['modified'], content: 'n.a.', originator: doc['leipzig:originator'] } record = find_or_initialize_by(url: attributes[:url]) record.update(attributes) ? record : nil end # use DSL to define search queries # see https://github.com/elastic/elasticsearch-ruby/tree/master/elasticsearch-dsl # and https://github.com/elastic/elasticsearch-rails/tree/master/elasticsearch-rails/lib/rails/templates def search(search_definition) Rails.logger.debug "Query: #{search_definition.to_json}" __elasticsearch__.search(search_definition) end def reset_index! __elasticsearch__.create_index! force: true all.each { |p| p.__elasticsearch__.index_document } end end end