Update elasticsearch

- Build our own Elasticsearch image based on 6.8.6, which includes the
configuration files for the German decompounder from https://github.com/uschindler/german-decompounder

- Move elastichq to a separate service and use the official image instead of including it as a plugin in the Elasticsearch image

- Adjust the decompounder filter config in papers to use the hyphenation_decompounder token filter that ships with Elasticsearch.
See https://www.elastic.co/guide/en/elasticsearch/reference/6.8/analysis-compound-word-tokenfilter.html

- Remove the obsolete "string" type from the mapping of papers and replace it
with "text" or "keyword" as appropriate. See https://www.elastic.co/blog/strings-are-dead-long-live-strings
This commit is contained in:
Lars Henrik Mai 2020-01-16 11:20:03 +01:00
parent 5dad333655
commit dc3a97d3b8
5 changed files with 80 additions and 62 deletions

View file

@ -31,8 +31,8 @@ gem 'omniauth'
gem 'omniauth-browserid' gem 'omniauth-browserid'
gem 'elasticsearch', '~> 6' gem 'elasticsearch', '~> 6'
gem 'elasticsearch-model' gem 'elasticsearch-model', '~> 6'
gem 'elasticsearch-rails' gem 'elasticsearch-rails', '~> 6'
gem 'elasticsearch-dsl' gem 'elasticsearch-dsl'
gem 'leaflet-rails' gem 'leaflet-rails'

View file

@ -79,11 +79,11 @@ GEM
elasticsearch-api (6.0.2) elasticsearch-api (6.0.2)
multi_json multi_json
elasticsearch-dsl (0.1.5) elasticsearch-dsl (0.1.5)
elasticsearch-model (5.0.0) elasticsearch-model (6.1.0)
activesupport (> 3) activesupport (> 3)
elasticsearch (> 1) elasticsearch (> 1)
hashie hashie
elasticsearch-rails (5.0.2) elasticsearch-rails (6.1.0)
elasticsearch-transport (6.0.2) elasticsearch-transport (6.0.2)
faraday faraday
multi_json multi_json
@ -287,8 +287,8 @@ DEPENDENCIES
database_cleaner database_cleaner
elasticsearch (~> 6) elasticsearch (~> 6)
elasticsearch-dsl elasticsearch-dsl
elasticsearch-model elasticsearch-model (~> 6)
elasticsearch-rails elasticsearch-rails (~> 6)
factory_bot_rails factory_bot_rails
faker faker
foundation-rails (~> 5.5) foundation-rails (~> 5.5)

View file

@ -22,42 +22,46 @@ class Paper < ActiveRecord::Base
index_name ['srm', Rails.env, self.base_class.to_s.pluralize.underscore].join('_') index_name ['srm', Rails.env, self.base_class.to_s.pluralize.underscore].join('_')
settings index: { settings index: {
number_of_shards: 1, number_of_shards: 1,
analysis: { analysis: {
filter: { filter: {
german_stop: { german_stop: {
type: "stop", type: "stop",
stopwords: "_german_" stopwords: "_german_"
},
german_stemmer: {
type: "stemmer",
language: "light_german"
},
decomp: {
type: "decompound"
}
}, },
analyzer: { german_stemmer: {
german: { type: "stemmer",
tokenizer: "standard", language: "light_german"
filter: [ },
"lowercase", german_decompounder: {
"german_stop", type: "hyphenation_decompounder",
"german_normalization", word_list_path: "analysis/dictionary-de.txt",
"german_stemmer", hyphenation_patterns_path: "analysis/de_DR.xml",
"decomp" only_longest_match: true,
] min_subword_size: 4
} },
},
analyzer: {
german: {
tokenizer: "standard",
filter: [
"lowercase",
"german_stop",
"german_decompounder",
"german_normalization",
"german_stemmer"
]
} }
} }
}
} do mappings dynamic: false do } do mappings dynamic: false do
indexes :name, type: :string, analyzer: "german" indexes :name, type: :text, analyzer: "german"
indexes :content, type: :string, analyzer: "german" indexes :content, type: :text, analyzer: "german"
indexes :resolution, type: :string, analyzer: "german" indexes :resolution, type: :text, analyzer: "german"
indexes :reference, type: :string, index: :not_analyzed indexes :reference, type: :keyword, index: true
indexes :paper_type, type: :string, index: :not_analyzed indexes :paper_type, type: :keyword, index: true
indexes :published_at, type: :date, index: :not_analyzed indexes :published_at, type: :date, index: true
indexes :originator, type: :string, index: :not_analyzed indexes :originator, type: :keyword, index: true
end end
end end

View file

@ -1,22 +1,28 @@
web: version: "3.7"
build: . volumes:
volumes: elasticsearch:
- .:/app services:
ports: web:
- "3000:3000" build: .
links: volumes:
- elasticsearch - .:/app
environment: ports:
ELASTICSEARCH_URL: 'http://elasticsearch:9200' - "3000:3000"
elasticsearch: links:
image: elasticsearch:5.4.3 - elasticsearch
command: environment:
- sh ELASTICSEARCH_URL: 'http://elasticsearch:9200'
- -c elasticsearch:
- "./bin/elasticsearch-plugin install http://xbib.org/repository/org/xbib/elasticsearch/plugin/elasticsearch-analysis-decompound/5.4.3.0/elasticsearch-analysis-decompound-5.4.3.0-plugin.zip; build: ./docker/elasticsearch
./bin/elasticsearch-plugin install https://github.com/royrusso/elasticsearch-HQ/zipball/master; environment:
/docker-entrypoint.sh elasticsearch" - discovery.type=single-node
ports: ports:
- "9200:9200" - "9200:9200"
volumes: volumes:
- .:/apps - elasticsearch:/usr/share/elasticsearch/data
elastichq:
image: elastichq/elasticsearch-hq
ports:
- "5000:5000"
links:
- elasticsearch

View file

@ -0,0 +1,8 @@
# Elasticsearch 6.8.6 image extended with the German word list and hyphenation
# patterns that the application's "hyphenation_decompounder" token filter
# references as analysis/dictionary-de.txt and analysis/de_DR.xml.
FROM elasticsearch:6.8.6

# Fetch both data files into config/analysis.
# - `cd` replaces pushd/popd: those are bash builtins, and the working
#   directory change does not outlive this RUN instruction anyway.
# - `-f` makes curl exit non-zero on HTTP errors, so a 404/500 aborts the build
#   instead of silently saving an error page as the dictionary/pattern file.
# NOTE(review): the files are taken from the moving `master` branch; consider
# pinning a commit or tag for reproducible builds.
RUN \
  mkdir -p /usr/share/elasticsearch/config/analysis && \
  cd /usr/share/elasticsearch/config/analysis && \
  curl -fsSLO https://github.com/uschindler/german-decompounder/raw/master/de_DR.xml && \
  curl -fsSLO https://github.com/uschindler/german-decompounder/raw/master/dictionary-de.txt