From dc3a97d3b86f59a304063dad60e8c97f9252b96e Mon Sep 17 00:00:00 2001 From: Lars Henrik Mai Date: Thu, 16 Jan 2020 11:20:03 +0100 Subject: [PATCH] Update elasticsearch - Build our own elasticsearch image based on 6.8.6 which includes configuration files for German decompounder from https://github.com/uschindler/german-decompounder - Move elastichq to separate service and use the official image instead of including it as plugin in the elasticsearch image - Adjust compounder filter config in papers to use the included hyphenation_decompounder. See https://www.elastic.co/guide/en/elasticsearch/reference/6.8/analysis-compound-word-tokenfilter.html - Remove obsolete "string" type in the mapping of papers, and replace it with "text" or "keyword" accordingly. See https://www.elastic.co/blog/strings-are-dead-long-live-strings --- Gemfile | 4 +- Gemfile.lock | 8 ++-- app/models/paper.rb | 72 +++++++++++++++++---------------- docker-compose.yml | 50 +++++++++++++---------- docker/elasticsearch/Dockerfile | 8 ++++ 5 files changed, 80 insertions(+), 62 deletions(-) create mode 100644 docker/elasticsearch/Dockerfile diff --git a/Gemfile b/Gemfile index 8fd436c..809ac63 100644 --- a/Gemfile +++ b/Gemfile @@ -31,8 +31,8 @@ gem 'omniauth' gem 'omniauth-browserid' gem 'elasticsearch', '~> 6' -gem 'elasticsearch-model' -gem 'elasticsearch-rails' +gem 'elasticsearch-model', '~> 6' +gem 'elasticsearch-rails', '~> 6' gem 'elasticsearch-dsl' gem 'leaflet-rails' diff --git a/Gemfile.lock b/Gemfile.lock index 0b6f194..fd4a7f2 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -79,11 +79,11 @@ GEM elasticsearch-api (6.0.2) multi_json elasticsearch-dsl (0.1.5) - elasticsearch-model (5.0.0) + elasticsearch-model (6.1.0) activesupport (> 3) elasticsearch (> 1) hashie - elasticsearch-rails (5.0.2) + elasticsearch-rails (6.1.0) elasticsearch-transport (6.0.2) faraday multi_json @@ -287,8 +287,8 @@ DEPENDENCIES database_cleaner elasticsearch (~> 6) elasticsearch-dsl - elasticsearch-model - 
elasticsearch-rails + elasticsearch-model (~> 6) + elasticsearch-rails (~> 6) factory_bot_rails faker foundation-rails (~> 5.5) diff --git a/app/models/paper.rb b/app/models/paper.rb index d01bae2..80c2fa8 100644 --- a/app/models/paper.rb +++ b/app/models/paper.rb @@ -21,43 +21,47 @@ class Paper < ActiveRecord::Base index_name ['srm', Rails.env, self.base_class.to_s.pluralize.underscore].join('_') - settings index: { - number_of_shards: 1, - analysis: { - filter: { - german_stop: { - type: "stop", - stopwords: "_german_" - }, - german_stemmer: { - type: "stemmer", - language: "light_german" - }, - decomp: { - type: "decompound" - } - }, - analyzer: { - german: { - tokenizer: "standard", - filter: [ - "lowercase", - "german_stop", - "german_normalization", - "german_stemmer", - "decomp" - ] - } + settings index: { + number_of_shards: 1, + analysis: { + filter: { + german_stop: { + type: "stop", + stopwords: "_german_" + }, + german_stemmer: { + type: "stemmer", + language: "light_german" + }, + german_decompounder: { + type: "hyphenation_decompounder", + word_list_path: "analysis/dictionary-de.txt", + hyphenation_patterns_path: "analysis/de_DR.xml", + only_longest_match: true, + min_subword_size: 4 + }, + }, + analyzer: { + german: { + tokenizer: "standard", + filter: [ + "lowercase", + "german_stop", + "german_decompounder", + "german_normalization", + "german_stemmer" + ] } } + } } do mappings dynamic: false do - indexes :name, type: :string, analyzer: "german" - indexes :content, type: :string, analyzer: "german" - indexes :resolution, type: :string, analyzer: "german" - indexes :reference, type: :string, index: :not_analyzed - indexes :paper_type, type: :string, index: :not_analyzed - indexes :published_at, type: :date, index: :not_analyzed - indexes :originator, type: :string, index: :not_analyzed + indexes :name, type: :text, analyzer: "german" + indexes :content, type: :text, analyzer: "german" + indexes :resolution, type: :text, analyzer: "german" + indexes 
:reference, type: :keyword, index: true + indexes :paper_type, type: :keyword, index: true + indexes :published_at, type: :date, index: true + indexes :originator, type: :keyword, index: true end end diff --git a/docker-compose.yml b/docker-compose.yml index 7c64229..0ddb81e 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,22 +1,28 @@ -web: - build: . - volumes: - - .:/app - ports: - - "3000:3000" - links: - - elasticsearch - environment: - ELASTICSEARCH_URL: 'http://elasticsearch:9200' -elasticsearch: - image: elasticsearch:5.4.3 - command: - - sh - - -c - - "./bin/elasticsearch-plugin install http://xbib.org/repository/org/xbib/elasticsearch/plugin/elasticsearch-analysis-decompound/5.4.3.0/elasticsearch-analysis-decompound-5.4.3.0-plugin.zip; - ./bin/elasticsearch-plugin install https://github.com/royrusso/elasticsearch-HQ/zipball/master; - /docker-entrypoint.sh elasticsearch" - ports: - - "9200:9200" - volumes: - - .:/apps +version: "3.7" +volumes: + elasticsearch: +services: + web: + build: . + volumes: + - .:/app + ports: + - "3000:3000" + links: + - elasticsearch + environment: + ELASTICSEARCH_URL: 'http://elasticsearch:9200' + elasticsearch: + build: ./docker/elasticsearch + environment: + - discovery.type=single-node + ports: + - "9200:9200" + volumes: + - elasticsearch:/usr/share/elasticsearch/data + elastichq: + image: elastichq/elasticsearch-hq + ports: + - "5000:5000" + links: + - elasticsearch diff --git a/docker/elasticsearch/Dockerfile b/docker/elasticsearch/Dockerfile new file mode 100644 index 0000000..9e0fed7 --- /dev/null +++ b/docker/elasticsearch/Dockerfile @@ -0,0 +1,8 @@ +FROM elasticsearch:6.8.6 + +RUN \ + mkdir -p /usr/share/elasticsearch/config/analysis && \ + pushd /usr/share/elasticsearch/config/analysis && \ + curl -sSLO https://github.com/uschindler/german-decompounder/raw/master/de_DR.xml && \ + curl -sSLO https://github.com/uschindler/german-decompounder/raw/master/dictionary-de.txt && \ + popd