stadtratmonitor/app/models/paper.rb

# frozen_string_literal: true

require 'elasticsearch/model'
require 'json'
require 'parseable_date_validator'

# A council paper persisted via ActiveRecord and mirrored into Elasticsearch
# through elasticsearch-model (the Callbacks module keeps the index in sync
# on create/update/destroy).
class Paper < ActiveRecord::Base
  include Elasticsearch::Model
  include Elasticsearch::Model::Callbacks

  # Imported content is cut to this many characters so that it stays below
  # the 100_000 character limit enforced by the :content validation.
  CONTENT_IMPORT_LIMIT = 99_998

  # Matches enumeration markers such as "1. " or "12. " inside an originator
  # string. `\d+` keeps multi-digit markers intact (a single `\d` would leave
  # the leading digit of "10. " attached to the preceding entry).
  ORIGINATOR_SEPARATOR = /\d+\.\s/.freeze

  validates :name,         presence: true, length: { maximum: 1000 }
  validates :url,          presence: true,
                           length: { maximum: 1000 },
                           uniqueness: true, # TODO: use unique index instead
                           url: true
  validates :reference,    presence: true, length: { maximum: 100 }
  validates :body,         presence: true, length: { maximum: 100 }
  validates :content,      presence: true, length: { maximum: 100_000 }
  validates :originator,   presence: true, length: { maximum: 300 }
  validates :paper_type,   presence: true, length: { maximum: 50 }
  validates :published_at, presence: true, parseable_date: true
  validates :resolution,   length: { maximum: 30_000 }

  # Environment specific index name, e.g. "srm_development_papers".
  index_name ['srm', Rails.env, base_class.to_s.pluralize.underscore].join('_')

  # Index settings: a custom "german" analyzer (stop words, decompounding,
  # normalization, stemming) used for the full-text fields below.
  settings index: {
    number_of_shards: 1,
    analysis: {
      filter: {
        german_stop: {
          type: 'stop',
          stopwords: '_german_'
        },
        german_stemmer: {
          type: 'stemmer',
          language: 'light_german'
        },
        german_decompounder: {
          type: 'hyphenation_decompounder',
          word_list_path: 'analysis/dictionary-de.txt',
          hyphenation_patterns_path: 'analysis/de_DR.xml',
          only_longest_match: true,
          min_subword_size: 4
        }
      },
      analyzer: {
        german: {
          tokenizer: 'standard',
          filter: %w[
            lowercase
            german_stop
            german_decompounder
            german_normalization
            german_stemmer
          ]
        }
      }
    }
  } do
    # dynamic: false => only the fields declared here are part of the mapping.
    mappings dynamic: false do
      indexes :name, type: :text, analyzer: 'german'
      indexes :content, type: :text, analyzer: 'german'
      indexes :resolution, type: :text, analyzer: 'german'
      indexes :reference, type: :keyword, index: true
      indexes :paper_type, type: :keyword, index: true
      indexes :published_at, type: :date, index: true
      indexes :originator, type: :keyword, index: true
    end
  end

  # Splits an enumerated originator string ("1. Foo 2. Bar") into its
  # individual entries, dropping blank fragments.
  #
  # @return [Array<String>] the originator entries; empty when originator is
  #   nil or blank
  def split_originator
    originator.to_s.split(ORIGINATOR_SEPARATOR).reject(&:blank?)
  end

  # Document representation sent to Elasticsearch: the regular JSON hash with
  # the originator replaced by the list from #split_originator.
  #
  # @param _options [Hash] accepted for API compatibility, unused
  # @return [Hash]
  def as_indexed_json(_options = {})
    as_json.merge(originator: split_originator)
  end

  class << self
    # Creates or updates papers from a JSON array of record hashes, using the
    # url as the unique identifier. Records that fail validation are reported
    # on stdout and skipped; the number of newly created papers is printed at
    # the end.
    #
    # @param json_string [String] JSON document containing an array of objects
    # @raise [JSON::ParserError] if json_string is not valid JSON
    def import_from_json(json_string)
      old_count = count
      JSON.parse(json_string).each do |record|
        attributes = {
          body: record['body'],
          # Safe navigation: a record without content must fail validation
          # (and be reported below) instead of aborting the whole import.
          content: record['content']&.truncate(CONTENT_IMPORT_LIMIT),
          name: record['name'],
          resolution: record['resolution'],
          originator: record['originator'],
          paper_type: record['paper_type'],
          published_at: record['published_at'],
          reference: record['reference'],
          url: record['url']
        }
        paper = find_or_initialize_by(url: attributes[:url])
        next if paper.update(attributes)

        puts "Not imported #{attributes[:name]}: #{paper.errors.messages}"
      end
      puts "Imported #{count - old_count} Papers!"
    end

    # use DSL to define search queries
    # see https://github.com/elastic/elasticsearch-ruby/tree/master/elasticsearch-dsl
    # and https://github.com/elastic/elasticsearch-rails/tree/master/elasticsearch-rails/lib/rails/templates
    #
    # Logs the query at debug level and delegates to elasticsearch-model.
    #
    # @param search_definition [#to_json] the search definition to execute
    def search(search_definition)
      Rails.logger.debug "Query: #{search_definition.to_json}"
      __elasticsearch__.search(search_definition)
    end

    # Drops and recreates the index, then re-indexes every paper.
    # find_each loads the records in batches instead of all at once.
    def reset_index!
      __elasticsearch__.create_index! force: true
      find_each { |paper| paper.__elasticsearch__.index_document }
    end
  end
end
Rubocop autocorrect app directory 2020-03-09 12:13:27 +01:00			`# frozen_string_literal: true`

Replace direct ES access with elasticsearch-model 2015-04-27 21:26:37 +02:00			`require 'elasticsearch/model'`
Import JSON data from morph.io via rake task Usage: MORPH_API_KEY=‘your key’ b rake import_papers:from_morph 2015-04-27 21:03:58 +02:00			`require 'json'`
Refactor and expand Paper validations 2015-09-27 13:51:13 +02:00			`require 'parseable_date_validator'`
Add Paper model, Paper.import_from_csv 2015-04-13 22:09:28 +02:00
			`class Paper < ActiveRecord::Base`
Replace direct ES access with elasticsearch-model 2015-04-27 21:26:37 +02:00			`include Elasticsearch::Model`
			`include Elasticsearch::Model::Callbacks`

Refactor and expand Paper validations 2015-09-27 13:51:13 +02:00			`validates :name, presence: true, length: { maximum: 1000 }`
			`validates :url, presence: true,`
			`length: { maximum: 1000 },`
Rubocop autocorrect app directory 2020-03-09 12:13:27 +01:00			`uniqueness: true, # TODO: use unique index instead`
Refactor and expand Paper validations 2015-09-27 13:51:13 +02:00			`url: true`
			`validates :reference, presence: true, length: { maximum: 100 }`
			`validates :body, presence: true, length: { maximum: 100 }`
			`validates :content, presence: true, length: { maximum: 100_000 }`
			`validates :originator, presence: true, length: { maximum: 300 }`
			`validates :paper_type, presence: true, length: { maximum: 50 }`
			`validates :published_at, presence: true, parseable_date: true`
			`validates :resolution, length: { maximum: 30_000 }`
Add importer model, validations to paper model 2015-05-17 17:49:09 +02:00
Rubocop autocorrect app directory 2020-03-09 12:13:27 +01:00			`index_name ['srm', Rails.env, base_class.to_s.pluralize.underscore].join('_')`
Create environment specific index name for papers This will require re-indexing your development database 2016-02-17 20:45:29 +01:00
Update elasticsearch - Build our own elasticsearch image based on 6.8.6 which includes configuration files for german decompounder from https://github.com/uschindler/german-decompounder - move elastichq to separate service and use the official image instead of including it as plugin in the elasticsearch image - Adjust compounder filter config in papers to use the included hyphenation_decompounder. See https://www.elastic.co/guide/en/elasticsearch/reference/6.8/analysis-compound-word-tokenfilter.html - Remove obsolete "string" type in the mapping of papers, and replace it with "text" or "keywords" accordingly. See https://www.elastic.co/blog/strings-are-dead-long-live-strings 2020-01-16 11:20:03 +01:00			`settings index: {`
			`number_of_shards: 1,`
			`analysis: {`
			`filter: {`
			`german_stop: {`
Rubocop autocorrect app directory 2020-03-09 12:13:27 +01:00			`type: 'stop',`
			`stopwords: '_german_'`
Update elasticsearch - Build our own elasticsearch image based on 6.8.6 which includes configuration files for german decompounder from https://github.com/uschindler/german-decompounder - move elastichq to separate service and use the official image instead of including it as plugin in the elasticsearch image - Adjust compounder filter config in papers to use the included hyphenation_decompounder. See https://www.elastic.co/guide/en/elasticsearch/reference/6.8/analysis-compound-word-tokenfilter.html - Remove obsolete "string" type in the mapping of papers, and replace it with "text" or "keywords" accordingly. See https://www.elastic.co/blog/strings-are-dead-long-live-strings 2020-01-16 11:20:03 +01:00			`},`
			`german_stemmer: {`
Rubocop autocorrect app directory 2020-03-09 12:13:27 +01:00			`type: 'stemmer',`
			`language: 'light_german'`
Update elasticsearch - Build our own elasticsearch image based on 6.8.6 which includes configuration files for german decompounder from https://github.com/uschindler/german-decompounder - move elastichq to separate service and use the official image instead of including it as plugin in the elasticsearch image - Adjust compounder filter config in papers to use the included hyphenation_decompounder. See https://www.elastic.co/guide/en/elasticsearch/reference/6.8/analysis-compound-word-tokenfilter.html - Remove obsolete "string" type in the mapping of papers, and replace it with "text" or "keywords" accordingly. See https://www.elastic.co/blog/strings-are-dead-long-live-strings 2020-01-16 11:20:03 +01:00			`},`
			`german_decompounder: {`
Rubocop autocorrect app directory 2020-03-09 12:13:27 +01:00			`type: 'hyphenation_decompounder',`
			`word_list_path: 'analysis/dictionary-de.txt',`
			`hyphenation_patterns_path: 'analysis/de_DR.xml',`
Update elasticsearch - Build our own elasticsearch image based on 6.8.6 which includes configuration files for german decompounder from https://github.com/uschindler/german-decompounder - move elastichq to separate service and use the official image instead of including it as plugin in the elasticsearch image - Adjust compounder filter config in papers to use the included hyphenation_decompounder. See https://www.elastic.co/guide/en/elasticsearch/reference/6.8/analysis-compound-word-tokenfilter.html - Remove obsolete "string" type in the mapping of papers, and replace it with "text" or "keywords" accordingly. See https://www.elastic.co/blog/strings-are-dead-long-live-strings 2020-01-16 11:20:03 +01:00			`only_longest_match: true,`
			`min_subword_size: 4`
Rubocop autocorrect app directory 2020-03-09 12:13:27 +01:00			`}`
Update elasticsearch - Build our own elasticsearch image based on 6.8.6 which includes configuration files for german decompounder from https://github.com/uschindler/german-decompounder - move elastichq to separate service and use the official image instead of including it as plugin in the elasticsearch image - Adjust compounder filter config in papers to use the included hyphenation_decompounder. See https://www.elastic.co/guide/en/elasticsearch/reference/6.8/analysis-compound-word-tokenfilter.html - Remove obsolete "string" type in the mapping of papers, and replace it with "text" or "keywords" accordingly. See https://www.elastic.co/blog/strings-are-dead-long-live-strings 2020-01-16 11:20:03 +01:00			`},`
			`analyzer: {`
			`german: {`
Rubocop autocorrect app directory 2020-03-09 12:13:27 +01:00			`tokenizer: 'standard',`
			`filter: %w[`
			`lowercase`
			`german_stop`
			`german_decompounder`
			`german_normalization`
			`german_stemmer`
Update elasticsearch - Build our own elasticsearch image based on 6.8.6 which includes configuration files for german decompounder from https://github.com/uschindler/german-decompounder - move elastichq to separate service and use the official image instead of including it as plugin in the elasticsearch image - Adjust compounder filter config in papers to use the included hyphenation_decompounder. See https://www.elastic.co/guide/en/elasticsearch/reference/6.8/analysis-compound-word-tokenfilter.html - Remove obsolete "string" type in the mapping of papers, and replace it with "text" or "keywords" accordingly. See https://www.elastic.co/blog/strings-are-dead-long-live-strings 2020-01-16 11:20:03 +01:00			`]`
28 - use elasticsearch-analysis-decompound plugin to support finding results within compound words 2016-12-29 18:22:10 +01:00			`}`
			`}`
Update elasticsearch - Build our own elasticsearch image based on 6.8.6 which includes configuration files for german decompounder from https://github.com/uschindler/german-decompounder - move elastichq to separate service and use the official image instead of including it as plugin in the elasticsearch image - Adjust compounder filter config in papers to use the included hyphenation_decompounder. See https://www.elastic.co/guide/en/elasticsearch/reference/6.8/analysis-compound-word-tokenfilter.html - Remove obsolete "string" type in the mapping of papers, and replace it with "text" or "keywords" accordingly. See https://www.elastic.co/blog/strings-are-dead-long-live-strings 2020-01-16 11:20:03 +01:00			`}`
Rubocop autocorrect app directory 2020-03-09 12:13:27 +01:00			`} do`
			`mappings dynamic: false do`
			`indexes :name, type: :text, analyzer: 'german'`
			`indexes :content, type: :text, analyzer: 'german'`
			`indexes :resolution, type: :text, analyzer: 'german'`
Update elasticsearch - Build our own elasticsearch image based on 6.8.6 which includes configuration files for german decompounder from https://github.com/uschindler/german-decompounder - move elastichq to separate service and use the official image instead of including it as plugin in the elasticsearch image - Adjust compounder filter config in papers to use the included hyphenation_decompounder. See https://www.elastic.co/guide/en/elasticsearch/reference/6.8/analysis-compound-word-tokenfilter.html - Remove obsolete "string" type in the mapping of papers, and replace it with "text" or "keywords" accordingly. See https://www.elastic.co/blog/strings-are-dead-long-live-strings 2020-01-16 11:20:03 +01:00			`indexes :reference, type: :keyword, index: true`
			`indexes :paper_type, type: :keyword, index: true`
			`indexes :published_at, type: :date, index: true`
			`indexes :originator, type: :keyword, index: true`
paper_type facets 2015-06-13 21:18:55 +02:00			`end`
Add paper validation tests 2015-09-27 10:25:40 +02:00			`end`
paper_type facets 2015-06-13 21:18:55 +02:00
split originator on indexing 2015-06-22 23:51:36 +02:00			`def split_originator`
Rubocop autocorrect app directory 2020-03-09 12:13:27 +01:00			`originator.split(/\d\.\s/).reject(&:blank?) \|\| originator`
split originator on indexing 2015-06-22 23:51:36 +02:00			`end`

Rubocop autocorrect app directory 2020-03-09 12:13:27 +01:00			`def as_indexed_json(_options = {})`
split originator on indexing 2015-06-22 23:51:36 +02:00			`as_json.merge(originator: split_originator)`
			`end`

Add Paper model, Paper.import_from_csv 2015-04-13 22:09:28 +02:00			`class << self`
Import JSON data from morph.io via rake task Usage: MORPH_API_KEY=‘your key’ b rake import_papers:from_morph 2015-04-27 21:03:58 +02:00			`def import_from_json(json_string)`
Add importer model, validations to paper model 2015-05-17 17:49:09 +02:00			`old_count = count`
Import JSON data from morph.io via rake task Usage: MORPH_API_KEY=‘your key’ b rake import_papers:from_morph 2015-04-27 21:03:58 +02:00			`JSON.parse(json_string).each do \|record\|`
truncate direct method 2023-04-30 01:00:24 +02:00			`content = record['content'].truncate(99998)`
Add Paper model, Paper.import_from_csv 2015-04-13 22:09:28 +02:00			`attributes = {`
Add importer model, validations to paper model 2015-05-17 17:49:09 +02:00			`body: record['body'],`
truncate content 2023-04-29 23:57:26 +02:00			`content: content,`
Import JSON data from morph.io via rake task Usage: MORPH_API_KEY=‘your key’ b rake import_papers:from_morph 2015-04-27 21:03:58 +02:00			`name: record['name'],`
Add importer model, validations to paper model 2015-05-17 17:49:09 +02:00			`resolution: record['resolution'],`
Import JSON data from morph.io via rake task Usage: MORPH_API_KEY=‘your key’ b rake import_papers:from_morph 2015-04-27 21:03:58 +02:00			`originator: record['originator'],`
Add importer model, validations to paper model 2015-05-17 17:49:09 +02:00			`paper_type: record['paper_type'],`
Import JSON data from morph.io via rake task Usage: MORPH_API_KEY=‘your key’ b rake import_papers:from_morph 2015-04-27 21:03:58 +02:00			`published_at: record['published_at'],`
Add importer model, validations to paper model 2015-05-17 17:49:09 +02:00			`reference: record['reference'],`
Rubocop autocorrect app directory 2020-03-09 12:13:27 +01:00			`url: record['url']`
Add Paper model, Paper.import_from_csv 2015-04-13 22:09:28 +02:00			`}`
Don’t import the same URI multiple times Use url as unique identifier during import 2015-04-28 00:47:26 +02:00			`record = find_or_initialize_by(url: attributes[:url])`
downgrade elasticsearch because not recognized as elastic from rails otherwise 2023-04-02 00:43:07 +02:00			`out = record.update(attributes)`
Use data originated from OParl API 2020-09-20 22:44:44 +02:00			`if !out`
			`puts "Not imported #{attributes[:name]}: #{record.errors.messages}"`
			`end`
Add Paper model, Paper.import_from_csv 2015-04-13 22:09:28 +02:00			`end`
Add importer model, validations to paper model 2015-05-17 17:49:09 +02:00			`puts "Imported #{count - old_count} Papers!"`
Add Paper model, Paper.import_from_csv 2015-04-13 22:09:28 +02:00			`end`
use dsl for search definition and fix pagination 2015-06-01 23:53:45 +02:00
			`# use DSL to define search queries`
			`# see https://github.com/elastic/elasticsearch-ruby/tree/master/elasticsearch-dsl`
			`# and https://github.com/elastic/elasticsearch-rails/tree/master/elasticsearch-rails/lib/rails/templates`
Use PaperSearch definition in controller 2015-10-05 22:13:53 +02:00			`def search(search_definition)`
			`Rails.logger.debug "Query: #{search_definition.to_json}"`
			`__elasticsearch__.search(search_definition)`
use dsl for search definition and fix pagination 2015-06-01 23:53:45 +02:00			`end`

paper_type facets 2015-06-13 21:18:55 +02:00			`def reset_index!`
			`__elasticsearch__.create_index! force: true`
Rubocop autocorrect app directory 2020-03-09 12:13:27 +01:00			`all.each { \|p\| p.__elasticsearch__.index_document }`
paper_type facets 2015-06-13 21:18:55 +02:00			`end`
Add Paper model, Paper.import_from_csv 2015-04-13 22:09:28 +02:00			`end`
			`end`