Update elasticsearch

- Build our own Elasticsearch image based on 6.8.6, which includes the configuration files for the German decompounder from https://github.com/uschindler/german-decompounder
- Move ElasticHQ to a separate service and use the official image instead of including it as a plugin in the Elasticsearch image
- Adjust the compounder filter config in papers to use the included hyphenation_decompounder. See https://www.elastic.co/guide/en/elasticsearch/reference/6.8/analysis-compound-word-tokenfilter.html
- Remove the obsolete "string" type from the mapping of papers and replace it with "text" or "keyword" accordingly. See https://www.elastic.co/blog/strings-are-dead-long-live-strings
2025-07-14 16:31:33 +02:00 · 2020-01-16 11:20:03 +01:00 · 2020-01-16 11:20:03 +01:00 · dc3a97d3b8
commit dc3a97d3b8
parent 5dad333655
5 changed files with 80 additions and 62 deletions
--- a/app/models/paper.rb
+++ b/app/models/paper.rb
@@ -21,43 +21,47 @@ class Paper < ActiveRecord::Base

  index_name ['srm', Rails.env, self.base_class.to_s.pluralize.underscore].join('_')

-  settings index: { 
-	  number_of_shards: 1,
-	  analysis: {
-		filter: {
-		  german_stop: {
-		    type: "stop",
-		    stopwords: "_german_"
-          }, 
-		  german_stemmer: {
-		    type: "stemmer",
-		    language: "light_german"
-          },
-		  decomp: {
-		    type: "decompound"
-          }
-        }, 
-		analyzer: {
-		  german: {
-		    tokenizer: "standard",
-		    filter: [
-		      "lowercase",
-		      "german_stop",
-		      "german_normalization",
-		      "german_stemmer",
-		      "decomp"
-		    ]
-          }
+  settings index: {
+    number_of_shards: 1,
+    analysis: {
+      filter: {
+        german_stop: {
+          type: "stop",
+          stopwords: "_german_"
+        },
+        german_stemmer: {
+          type: "stemmer",
+          language: "light_german"
+        },
+        german_decompounder: {
+          type: "hyphenation_decompounder",
+          word_list_path: "analysis/dictionary-de.txt",
+          hyphenation_patterns_path: "analysis/de_DR.xml",
+          only_longest_match: true,
+          min_subword_size: 4
+        },
+      },
+      analyzer: {
+        german: {
+          tokenizer: "standard",
+          filter: [
+            "lowercase",
+            "german_stop",
+              "german_decompounder",
+            "german_normalization",
+            "german_stemmer"
+          ]
        }
      }
+    }
    } do mappings dynamic: false do
-      indexes :name, type: :string, analyzer: "german"
-      indexes :content, type: :string, analyzer: "german"
-      indexes :resolution, type: :string, analyzer: "german"
-      indexes :reference, type: :string, index: :not_analyzed
-      indexes :paper_type, type: :string, index: :not_analyzed
-      indexes :published_at, type: :date, index: :not_analyzed
-      indexes :originator, type: :string, index: :not_analyzed
+      indexes :name, type: :text, analyzer: "german"
+      indexes :content, type: :text, analyzer: "german"
+      indexes :resolution, type: :text, analyzer: "german"
+      indexes :reference, type: :keyword, index: true
+      indexes :paper_type, type: :keyword, index: true
+      indexes :published_at, type: :date, index: true
+      indexes :originator, type: :keyword, index: true
    end
  end