Update elasticsearch

- Build our own Elasticsearch image based on 6.8.6, which includes the
configuration files for the German decompounder from https://github.com/uschindler/german-decompounder

- Move ElasticHQ to a separate service and use the official image instead of including it as a plugin in the Elasticsearch image

- Adjust the decompounder filter config in papers to use the included hyphenation_decompounder token filter.
See https://www.elastic.co/guide/en/elasticsearch/reference/6.8/analysis-compound-word-tokenfilter.html

- Remove the obsolete "string" type from the mapping of papers and replace it
with "text" or "keyword" as appropriate. See https://www.elastic.co/blog/strings-are-dead-long-live-strings
This commit is contained in:
Lars Henrik Mai 2020-01-16 11:20:03 +01:00
parent 5dad333655
commit dc3a97d3b8
5 changed files with 80 additions and 62 deletions

View file

@ -21,43 +21,47 @@ class Paper < ActiveRecord::Base
index_name ['srm', Rails.env, self.base_class.to_s.pluralize.underscore].join('_')
settings index: {
number_of_shards: 1,
analysis: {
filter: {
german_stop: {
type: "stop",
stopwords: "_german_"
},
german_stemmer: {
type: "stemmer",
language: "light_german"
},
decomp: {
type: "decompound"
}
},
analyzer: {
german: {
tokenizer: "standard",
filter: [
"lowercase",
"german_stop",
"german_normalization",
"german_stemmer",
"decomp"
]
}
settings index: {
number_of_shards: 1,
analysis: {
filter: {
german_stop: {
type: "stop",
stopwords: "_german_"
},
german_stemmer: {
type: "stemmer",
language: "light_german"
},
german_decompounder: {
type: "hyphenation_decompounder",
word_list_path: "analysis/dictionary-de.txt",
hyphenation_patterns_path: "analysis/de_DR.xml",
only_longest_match: true,
min_subword_size: 4
},
},
analyzer: {
german: {
tokenizer: "standard",
filter: [
"lowercase",
"german_stop",
"german_decompounder",
"german_normalization",
"german_stemmer"
]
}
}
}
} do mappings dynamic: false do
indexes :name, type: :string, analyzer: "german"
indexes :content, type: :string, analyzer: "german"
indexes :resolution, type: :string, analyzer: "german"
indexes :reference, type: :string, index: :not_analyzed
indexes :paper_type, type: :string, index: :not_analyzed
indexes :published_at, type: :date, index: :not_analyzed
indexes :originator, type: :string, index: :not_analyzed
indexes :name, type: :text, analyzer: "german"
indexes :content, type: :text, analyzer: "german"
indexes :resolution, type: :text, analyzer: "german"
indexes :reference, type: :keyword, index: true
indexes :paper_type, type: :keyword, index: true
indexes :published_at, type: :date, index: true
indexes :originator, type: :keyword, index: true
end
end