Update elasticsearch

- Build our own Elasticsearch image based on 6.8.6, which includes the
configuration files for the German decompounder from https://github.com/uschindler/german-decompounder

- Move ElasticHQ to a separate service and use the official image instead of including it as a plugin in the Elasticsearch image

- Adjust the decompounder filter config in papers to use the built-in hyphenation_decompounder.
See https://www.elastic.co/guide/en/elasticsearch/reference/6.8/analysis-compound-word-tokenfilter.html

- Remove the obsolete "string" type from the mapping of papers, and replace it
with "text" or "keyword" accordingly. See https://www.elastic.co/blog/strings-are-dead-long-live-strings
This commit is contained in:
Lars Henrik Mai 2020-01-16 11:20:03 +01:00
parent 5dad333655
commit dc3a97d3b8
5 changed files with 80 additions and 62 deletions

View file

@ -31,8 +31,8 @@ gem 'omniauth'
gem 'omniauth-browserid'
gem 'elasticsearch', '~> 6'
gem 'elasticsearch-model'
gem 'elasticsearch-rails'
gem 'elasticsearch-model', '~> 6'
gem 'elasticsearch-rails', '~> 6'
gem 'elasticsearch-dsl'
gem 'leaflet-rails'

View file

@ -79,11 +79,11 @@ GEM
elasticsearch-api (6.0.2)
multi_json
elasticsearch-dsl (0.1.5)
elasticsearch-model (5.0.0)
elasticsearch-model (6.1.0)
activesupport (> 3)
elasticsearch (> 1)
hashie
elasticsearch-rails (5.0.2)
elasticsearch-rails (6.1.0)
elasticsearch-transport (6.0.2)
faraday
multi_json
@ -287,8 +287,8 @@ DEPENDENCIES
database_cleaner
elasticsearch (~> 6)
elasticsearch-dsl
elasticsearch-model
elasticsearch-rails
elasticsearch-model (~> 6)
elasticsearch-rails (~> 6)
factory_bot_rails
faker
foundation-rails (~> 5.5)

View file

@ -21,43 +21,47 @@ class Paper < ActiveRecord::Base
index_name ['srm', Rails.env, self.base_class.to_s.pluralize.underscore].join('_')
settings index: {
number_of_shards: 1,
analysis: {
filter: {
german_stop: {
type: "stop",
stopwords: "_german_"
},
german_stemmer: {
type: "stemmer",
language: "light_german"
},
decomp: {
type: "decompound"
}
},
analyzer: {
german: {
tokenizer: "standard",
filter: [
"lowercase",
"german_stop",
"german_normalization",
"german_stemmer",
"decomp"
]
}
settings index: {
number_of_shards: 1,
analysis: {
filter: {
german_stop: {
type: "stop",
stopwords: "_german_"
},
german_stemmer: {
type: "stemmer",
language: "light_german"
},
german_decompounder: {
type: "hyphenation_decompounder",
word_list_path: "analysis/dictionary-de.txt",
hyphenation_patterns_path: "analysis/de_DR.xml",
only_longest_match: true,
min_subword_size: 4
},
},
analyzer: {
german: {
tokenizer: "standard",
filter: [
"lowercase",
"german_stop",
"german_decompounder",
"german_normalization",
"german_stemmer"
]
}
}
}
} do mappings dynamic: false do
indexes :name, type: :string, analyzer: "german"
indexes :content, type: :string, analyzer: "german"
indexes :resolution, type: :string, analyzer: "german"
indexes :reference, type: :string, index: :not_analyzed
indexes :paper_type, type: :string, index: :not_analyzed
indexes :published_at, type: :date, index: :not_analyzed
indexes :originator, type: :string, index: :not_analyzed
indexes :name, type: :text, analyzer: "german"
indexes :content, type: :text, analyzer: "german"
indexes :resolution, type: :text, analyzer: "german"
indexes :reference, type: :keyword, index: true
indexes :paper_type, type: :keyword, index: true
indexes :published_at, type: :date, index: true
indexes :originator, type: :keyword, index: true
end
end

View file

@ -1,22 +1,28 @@
web:
build: .
volumes:
- .:/app
ports:
- "3000:3000"
links:
- elasticsearch
environment:
ELASTICSEARCH_URL: 'http://elasticsearch:9200'
elasticsearch:
image: elasticsearch:5.4.3
command:
- sh
- -c
- "./bin/elasticsearch-plugin install http://xbib.org/repository/org/xbib/elasticsearch/plugin/elasticsearch-analysis-decompound/5.4.3.0/elasticsearch-analysis-decompound-5.4.3.0-plugin.zip;
./bin/elasticsearch-plugin install https://github.com/royrusso/elasticsearch-HQ/zipball/master;
/docker-entrypoint.sh elasticsearch"
ports:
- "9200:9200"
volumes:
- .:/apps
# Compose file for local development: the web application, an Elasticsearch
# node built from ./docker/elasticsearch, and an ElasticHQ instance.
version: "3.7"
# Named volume; mounted below at /usr/share/elasticsearch/data by the
# elasticsearch service.
volumes:
elasticsearch:
services:
# Application container, built from the directory containing this file.
web:
build: .
volumes:
- .:/app
ports:
- "3000:3000"
links:
- elasticsearch
environment:
# Points at the elasticsearch service defined below, on its published port.
ELASTICSEARCH_URL: 'http://elasticsearch:9200'
# Elasticsearch image built from the Dockerfile in ./docker/elasticsearch.
elasticsearch:
build: ./docker/elasticsearch
environment:
# NOTE(review): presumably selects single-node discovery so the node starts
# without looking for cluster peers; confirm against the Elasticsearch docs.
- discovery.type=single-node
ports:
- "9200:9200"
volumes:
- elasticsearch:/usr/share/elasticsearch/data
# ElasticHQ runs as its own service from the elastichq/elasticsearch-hq image.
elastichq:
image: elastichq/elasticsearch-hq
ports:
- "5000:5000"
links:
- elasticsearch

View file

@ -0,0 +1,8 @@
# Elasticsearch 6.8.6 image that additionally ships the hyphenation patterns
# (de_DR.xml) and word list (dictionary-de.txt) from
# https://github.com/uschindler/german-decompounder under config/analysis.
FROM elasticsearch:6.8.6
# curl is run with --fail (-f) so an HTTP error aborts the build instead of
# silently saving an error page as the dictionary or pattern file.
# A plain `cd` is enough here: the working directory change only lasts for
# this RUN instruction, so the bash-only pushd/popd pair is unnecessary.
# NOTE(review): both files are fetched from the `master` branch, so rebuilds
# are not reproducible; consider pinning a commit or tag.
RUN \
    mkdir -p /usr/share/elasticsearch/config/analysis && \
    cd /usr/share/elasticsearch/config/analysis && \
    curl -fsSLO https://github.com/uschindler/german-decompounder/raw/master/de_DR.xml && \
    curl -fsSLO https://github.com/uschindler/german-decompounder/raw/master/dictionary-de.txt