From 700184f7767ab0520dd9cedbf0cbc0c158b6a0d1 Mon Sep 17 00:00:00 2001 From: Pete Matsyburka Date: Mon, 9 Jun 2025 09:55:30 +0300 Subject: [PATCH] add ngram --- app/models/search_entry.rb | 4 + ...0250608163157_add_ngram_to_search_index.rb | 14 +++ db/schema.rb | 6 +- lib/search_entries.rb | 98 +++++++++++++++---- lib/submissions.rb | 2 +- lib/submitters.rb | 17 ++-- 6 files changed, 114 insertions(+), 27 deletions(-) create mode 100644 db/migrate/20250608163157_add_ngram_to_search_index.rb diff --git a/app/models/search_entry.rb b/app/models/search_entry.rb index bc641669..dca2db05 100644 --- a/app/models/search_entry.rb +++ b/app/models/search_entry.rb @@ -5,6 +5,7 @@ # Table name: search_entries # # id :bigint not null, primary key +# ngram :tsvector # record_type :string not null # tsvector :tsvector not null # created_at :datetime not null @@ -14,6 +15,9 @@ # # Indexes # +# index_search_entries_on_account_id_ngram_submission (account_id,ngram) WHERE ((record_type)::text = 'Submission'::text) USING gin +# index_search_entries_on_account_id_ngram_submitter (account_id,ngram) WHERE ((record_type)::text = 'Submitter'::text) USING gin +# index_search_entries_on_account_id_ngram_template (account_id,ngram) WHERE ((record_type)::text = 'Template'::text) USING gin # index_search_entries_on_account_id_tsvector_submission (account_id,tsvector) WHERE ((record_type)::text = 'Submission'::text) USING gin # index_search_entries_on_account_id_tsvector_submitter (account_id,tsvector) WHERE ((record_type)::text = 'Submitter'::text) USING gin # index_search_entries_on_account_id_tsvector_template (account_id,tsvector) WHERE ((record_type)::text = 'Template'::text) USING gin diff --git a/db/migrate/20250608163157_add_ngram_to_search_index.rb b/db/migrate/20250608163157_add_ngram_to_search_index.rb new file mode 100644 index 00000000..8cdb049f --- /dev/null +++ b/db/migrate/20250608163157_add_ngram_to_search_index.rb @@ -0,0 +1,14 @@ +# frozen_string_literal: true + +class AddNgramToSearchIndex < ActiveRecord::Migration[8.0] + def change + add_column :search_entries, :ngram, :tsvector + + add_index :search_entries, %i[account_id ngram], using: :gin, where: "record_type = 'Submitter'", + name: 'index_search_entries_on_account_id_ngram_submitter' + add_index :search_entries, %i[account_id ngram], using: :gin, where: "record_type = 'Submission'", + name: 'index_search_entries_on_account_id_ngram_submission' + add_index :search_entries, %i[account_id ngram], using: :gin, where: "record_type = 'Template'", + name: 'index_search_entries_on_account_id_ngram_template' + end +end diff --git a/db/schema.rb b/db/schema.rb index ca4801e4..f0ecc90e 100644 --- a/db/schema.rb +++ b/db/schema.rb @@ -10,7 +10,7 @@ # # It's strongly recommended that you check this file into your version control system. -ActiveRecord::Schema[8.0].define(version: 2025_06_03_105556) do +ActiveRecord::Schema[8.0].define(version: 2025_06_08_163157) do # These are extensions that must be enabled in order to support this database enable_extension "btree_gin" enable_extension "plpgsql" @@ -264,6 +264,10 @@ ActiveRecord::Schema[8.0].define(version: 2025_06_03_105556) do t.tsvector "tsvector", null: false t.datetime "created_at", null: false t.datetime "updated_at", null: false + t.tsvector "ngram" + t.index ["account_id", "ngram"], name: "index_search_entries_on_account_id_ngram_submission", where: "((record_type)::text = 'Submission'::text)", using: :gin + t.index ["account_id", "ngram"], name: "index_search_entries_on_account_id_ngram_submitter", where: "((record_type)::text = 'Submitter'::text)", using: :gin + t.index ["account_id", "ngram"], name: "index_search_entries_on_account_id_ngram_template", where: "((record_type)::text = 'Template'::text)", using: :gin t.index ["account_id", "tsvector"], name: "index_search_entries_on_account_id_tsvector_submission", where: "((record_type)::text = 'Submission'::text)", using: :gin t.index ["account_id", "tsvector"], name: "index_search_entries_on_account_id_tsvector_submitter", where: "((record_type)::text = 'Submitter'::text)", using: :gin t.index ["account_id", "tsvector"], name: "index_search_entries_on_account_id_tsvector_template", where: "((record_type)::text = 'Template'::text)", using: :gin diff --git a/lib/search_entries.rb b/lib/search_entries.rb index 270bba70..efad8a39 100644 --- a/lib/search_entries.rb +++ b/lib/search_entries.rb @@ -42,21 +42,43 @@ module SearchEntries end end - def build_tsquery(keyword) + def build_tsquery(keyword, with_or_vector: false) keyword = keyword.delete("\0") if keyword.match?(/\d/) && !keyword.match?(/\p{L}/) number = keyword.gsub(/\D/, '') - ["tsvector @@ ((quote_literal(?) || ':*')::tsquery || (quote_literal(?) || ':*')::tsquery || plainto_tsquery(?))", - number, number.length > 1 ? number.delete_prefix('0') : number, keyword] + sql = + if number.length <= 2 + <<~SQL.squish + ngram @@ (quote_literal(?)::tsquery || quote_literal(?)::tsquery) OR tsvector @@ plainto_tsquery(?) + SQL + else + <<~SQL.squish + tsvector @@ ((quote_literal(?) || ':*')::tsquery || (quote_literal(?) || ':*')::tsquery || plainto_tsquery(?)) + SQL + end + + [sql, number, number.length > 1 ? number.delete_prefix('0') : number, keyword] elsif keyword.match?(/[^\p{L}\d&@._\-+]/) || keyword.match?(/\A['"].*['"]\z/) ['tsvector @@ plainto_tsquery(?)', TextUtils.transliterate(keyword.downcase)] else - [ - "tsvector @@ (quote_literal(coalesce((ts_lexize('english_stem', :keyword))[1], :keyword)) || ':*')::tsquery", - { keyword: TextUtils.transliterate(keyword.downcase).squish } - ] + keyword = TextUtils.transliterate(keyword.downcase).squish + + sql = + if keyword.length <= 2 + arel = Arel.sql(<<~SQL.squish) + ngram @@ quote_literal(:keyword)::tsquery + SQL + + arel = Arel::Nodes::Or.new([arel, Arel.sql('tsvector @@ plainto_tsquery(:keyword)')]).to_sql if with_or_vector + + arel + else + "tsvector @@ (quote_literal(coalesce((ts_lexize('english_stem', :keyword))[1], :keyword)) || ':*')::tsquery" + end + + [sql, { keyword: }] end end @@ -78,25 +100,51 @@ module SearchEntries ["tsvector @@ (#{query.to_sql})", terms.index_by.with_index { |_, index| :"term#{index}" }.merge(weight:)] end + def build_weights_wildcard_tsquery(keyword, weight) + keyword = TextUtils.transliterate(keyword.downcase).squish + + sql = + if keyword.length <= 2 + <<~SQL.squish + ngram @@ (quote_literal(:keyword) || ':' || :weight)::tsquery + SQL + else + <<~SQL.squish + tsvector @@ (quote_literal(coalesce((ts_lexize('english_stem', :keyword))[1], :keyword)) || ':*' || :weight)::tsquery + SQL + end + + [sql, { keyword:, weight: }] + end + def index_submitter(submitter) return if submitter.email.blank? && submitter.phone.blank? && submitter.name.blank? + email_phone_name = [ + [submitter.email.to_s, submitter.email.to_s.split('@').last].join(' ').delete("\0"), + [submitter.phone.to_s.gsub(/\D/, ''), + submitter.phone.to_s.gsub(PhoneCodes::REGEXP, '').gsub(/\D/, '')].uniq.join(' ').delete("\0"), + TextUtils.transliterate(submitter.name).delete("\0") + ] + sql = SearchEntry.sanitize_sql_array( [ "SELECT setweight(to_tsvector(?), 'A') || setweight(to_tsvector(?), 'B') || - setweight(to_tsvector(?), 'C') || setweight(to_tsvector(?), 'D')".squish, - [submitter.email.to_s, submitter.email.to_s.split('@').last].join(' ').downcase.delete("\0"), - [submitter.phone.to_s.gsub(/\D/, ''), - submitter.phone.to_s.gsub(PhoneCodes::REGEXP, '').gsub(/\D/, '')].uniq.join(' ').delete("\0"), - TextUtils.transliterate(submitter.name.to_s.downcase).delete("\0"), - build_submitter_values_string(submitter) + setweight(to_tsvector(?), 'C') || setweight(to_tsvector(?), 'D') as tsvector, + setweight(to_tsvector('simple', ?), 'A') || + setweight(to_tsvector('simple', ?), 'B') || + setweight(to_tsvector('simple', ?), 'C') as ngram".squish, + *email_phone_name, + build_submitter_values_string(submitter), + *email_phone_name ] ) entry = submitter.search_entry || submitter.build_search_entry entry.account_id = submitter.account_id - entry.tsvector = SearchEntry.connection.select_value(sql) + entry.tsvector, ngram = SearchEntry.connection.select_rows(sql).first + entry.ngram = build_ngram(ngram) return if entry.tsvector.blank? @@ -122,13 +170,15 @@ module SearchEntries def index_template(template) sql = SearchEntry.sanitize_sql_array( - ['SELECT to_tsvector(?)', TextUtils.transliterate(template.name.to_s.downcase).delete("\0")] + ["SELECT to_tsvector(:text), to_tsvector('simple', :text)", + { text: TextUtils.transliterate(template.name.to_s.downcase).delete("\0") }] ) entry = template.search_entry || template.build_search_entry entry.account_id = template.account_id - entry.tsvector = SearchEntry.connection.select_value(sql) + entry.tsvector, ngram = SearchEntry.connection.select_rows(sql).first + entry.ngram = build_ngram(ngram) return if entry.tsvector.blank? @@ -145,13 +195,15 @@ module SearchEntries return if submission.name.blank? sql = SearchEntry.sanitize_sql_array( - ['SELECT to_tsvector(?)', TextUtils.transliterate(submission.name.to_s.downcase).delete("\0")] + ["SELECT to_tsvector(:text), to_tsvector('simple', :text)", + { text: TextUtils.transliterate(submission.name.to_s.downcase).delete("\0") }] ) entry = submission.search_entry || submission.build_search_entry entry.account_id = submission.account_id - entry.tsvector = SearchEntry.connection.select_value(sql) + entry.tsvector, ngram = SearchEntry.connection.select_rows(sql).first + entry.ngram = build_ngram(ngram) return if entry.tsvector.blank? @@ -163,4 +215,14 @@ module SearchEntries retry end + + def build_ngram(ngram) + ngrams = + ngram.split(/\s(?=')/).each_with_object([]) do |item, acc| + acc << item.sub(/'(.*?)':/) { "'#{Regexp.last_match(1).first(2)}':" } + acc << item.sub(/'(.*?)':/) { "'#{Regexp.last_match(1).first(1)}':" } + end + + ngrams.uniq { |e| e.sub(/':[\d,]/, "':1") }.join(' ') + end end diff --git a/lib/submissions.rb b/lib/submissions.rb index e6e3d6ab..2ba84731 100644 --- a/lib/submissions.rb +++ b/lib/submissions.rb @@ -60,7 +60,7 @@ module Submissions arel = Arel::Nodes::Union.new( arel, Submitter.joins(:search_entry) .where(search_entry: { account_id: current_user.account_id }) - .where(*SearchEntries.build_tsquery(keyword)) + .where(*SearchEntries.build_tsquery(keyword, with_or_vector: true)) .select(:submission_id).arel ) diff --git a/lib/submitters.rb b/lib/submitters.rb index fa07076a..6e7d856d 100644 --- a/lib/submitters.rb +++ b/lib/submitters.rb @@ -45,21 +45,24 @@ module Submitters if keyword.match?(/\d/) && !keyword.match?(/\p{L}/) number = keyword.gsub(/\D/, '') - ["tsvector @@ ((quote_literal(?) || ':*#{weight}')::tsquery || (quote_literal(?) || ':*#{weight}')::tsquery)", - number, number.length > 1 ? number.delete_prefix('0') : number] + sql = + if number.length <= 2 + "ngram @@ ((quote_literal(?) || ':' || ?)::tsquery || (quote_literal(?) || ':' || ?)::tsquery)" + else + "tsvector @@ ((quote_literal(?) || ':*' || ?)::tsquery || (quote_literal(?) || ':*' || ?)::tsquery)" + end + + [sql, number, weight, number.length > 1 ? number.delete_prefix('0') : number, weight] elsif keyword.match?(/[^\p{L}\d&@._\-+]/) terms = TextUtils.transliterate(keyword.downcase).split(/\b/).map(&:squish).compact_blank.uniq if terms.size > 1 SearchEntries.build_weights_tsquery(terms, weight) else - [ - SearchEntries::FIELD_SEARCH_QUERY_SQL, - { keyword: TextUtils.transliterate(keyword.downcase).squish, weight: } - ] + SearchEntries.build_weights_wildcard_tsquery(keyword, weight) end else - [SearchEntries::FIELD_SEARCH_QUERY_SQL, { keyword: TextUtils.transliterate(keyword.downcase).squish, weight: }] + SearchEntries.build_weights_wildcard_tsquery(keyword, weight) end submitters.where(