Overview

Understanding TF-IDF / BM25 scoring by implementing it

Audience

People who are implementing (or have implemented) search with Elasticsearch but whose understanding of scoring is still fuzzy

Contents

  1. Implementing the algorithms
  2. The actual scoring logic
  3. References

Not covered

  • What full-text search is
  • Inverted-index data structures / tokenizer details

The main goal is simply to get an intuitive feel for how documents get scored.

Details

1. Implementing the algorithms

Both TF-IDF and BM25 can be understood with nothing more than high-school math.

TF = Term Frequency

Term frequency is how often the given term occurs within a document

= how strongly the term characterizes that document
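
Written out as a formula, where t is the term, d is the document, count(t, d) is the number of times t occurs in d, and |d| is the total number of terms in d:

tf(t, d) = count(t, d) / |d|

This is what the code below computes.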

# Quoted from the description of ゆるキャン△ Vol. 1
target = [
  "富士山", "が", "見える", "湖畔", "で", "キャンプ", "を", "する", "女の子", "リン", "自転車", "に", "乗り", "富士山", "を", "見に", "きた", "女の子", "なでしこ", "二人", "で", "カップラーメン", "を", "食べて", "見た","景色", "は","読めば", "キャンプ", "に", "行き", "たく", "なる","行か","なくて","も","行った","気分","に", "なる", "そんな", "新感覚", "キャンプ","マンガ","の", "登場", "です"
]

def calc_tf(target, word)
  all_term_count = target.length        # total number of terms in the document
  match_term_count = target.count(word) # occurrences of the given term

  return match_term_count.to_f / all_term_count.to_f
end


words = ["キャンプ", "カップラーメン", "富士山", "本栖湖"]

words.each do |w|
  tf = calc_tf(target, w)
  pp sprintf("tf: %f word: %s", tf, w)
end

# "td: 0.063830 word: キャンプ" x3
# "td: 0.021277 word: カップラーメン" x1
# "td: 0.042553 word: 富士山" x2
# "td: 0.000000 word: 本栖湖" x0

The scores reflect the occurrence counts as you would expect.

IDF = Inverse Document Frequency

Inverse document frequency is the logarithm of the inverse of the fraction of documents in the collection that contain the given term

= how distinctive the term is within the document collection
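
Written as a formula, where N is the number of documents in the collection and df(t) is the number of documents containing the term t:

idf(t) = log(N / df(t))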

targets = []
# Quoted from the description of ゆるキャン△ Vol. 1
targets[0] = [
  "富士山", "が", "見える", "湖畔", "で", "キャンプ", "を", "する", "女の子", "リン", "自転車", "に", "乗り", "富士山", "を", "見に", "きた", "女の子", "なでしこ", "二人", "で", "カップラーメン", "を", "食べて", "見た","景色", "は","読めば", "キャンプ", "に", "行き", "たく", "なる","行か","なくて","も","行った","気分","に", "なる", "そんな", "新感覚", "キャンプ","マンガ","の", "登場", "です"
]
# Quoted from the description of 恋は光 Vol. 1
targets[1] = [
	"恋", "の", "光", "が", "視えて", "しまう", "大学生", "西条", "は", "恋", "を", "探求", "する", "女の子", "東雲", "に", "恋", "を", "した", "視える", "から", "こそ","切なくて", "苦しい", "今", "まで", "に", "ない", "ラブストーリー", "が", "始まる"
]

def calc_idf(targets, word)
  all_doc_count = targets.length
  match_doc_count = 0
  targets.each do |target|
    if target.include?(word)
      match_doc_count = match_doc_count + 1
    end
  end

  return Math.log(all_doc_count.to_f / match_doc_count.to_f)
end

words = ["キャンプ", "恋", "女の子", "バトル"]

words.each do |w|
  idf =  calc_idf(targets, w)
  pp sprintf("idf: %f word: %s", idf, w)
end

# "idf: 0.693147 word: キャンプ" x1
# "idf: 0.693147 word: 恋" x1
# "idf: 0.000000 word: 女の子" x2
# "idf: Inf      word: バトル" x0
  • キャンプ / 恋: each appears in only one of the two documents → high score
  • 女の子: appears in both documents → low score
  • バトル: appears in neither document → Inf

Distinctiveness is reflected in the score as you would expect.

TF-IDF

TF-IDF is the product of TF and IDF
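
That is, for a term t and a document d:

tfidf(t, d) = tf(t, d) * idf(t)

A term scores high only when it is both frequent within d and rare across the collection.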

targets = {}
# Quoted from the description of ゆるキャン△ Vol. 1
targets["yurucam"] = [
  "富士山", "が", "見える", "湖畔", "で", "キャンプ", "を", "する", "女の子", "リン", "自転車", "に", "乗り", "富士山", "を", "見に", "きた", "女の子", "なでしこ", "二人", "で", "カップラーメン", "を", "食べて", "見た","景色", "は","読めば", "キャンプ", "に", "行き", "たく", "なる","行か","なくて","も","行った","気分","に", "なる", "そんな", "新感覚", "キャンプ","マンガ","の", "登場", "です"
]
# Quoted from the description of 恋は光 Vol. 1
targets["koihika"] = [
  "恋", "の", "光", "が", "視えて", "しまう", "大学生", "西条", "は", "恋", "を", "探求", "する", "女の子", "東雲", "に", "恋", "を", "した", "視える", "から", "こそ","切なくて", "苦しい", "今", "まで", "に", "ない", "ラブストーリー", "が", "始まる"
]

def calc_tf(target, word)
  all_term_count = target.length        # total number of terms in the document
  match_term_count = target.count(word) # occurrences of the given term

  return match_term_count.to_f / all_term_count.to_f
end

def calc_idf(targets, word)
  all_doc_count = targets.length
  match_doc_count = 0
  targets.each do |target|
    if target.include?(word)
      match_doc_count = match_doc_count + 1
    end
  end

  return Math.log(all_doc_count.to_f / match_doc_count.to_f)
end

words = ["キャンプ", "恋", "カップラーメン", "富士山", "大学生", "女の子", "バトル"]

words.each do |w|
  idf = calc_idf(targets.values, w)
  targets.each do |name, target|
    tf = calc_tf(target, w)
    tf_idf = tf * idf
    pp sprintf("target: %s tf-idf: %f word: %s", name, tf_idf, w)
  end
end

# "target: yurucam tf-idf: 0.044243 word: キャンプ"
# "target: koihika tf-idf: 0.000000 word: キャンプ"
# "target: yurucam tf-idf: 0.000000 word: 恋"
# "target: koihika tf-idf: 0.067079 word: 恋"

# "target: yurucam tf-idf: 0.014748 word: カップラーメン"
# "target: koihika tf-idf: 0.000000 word: カップラーメン"
# "target: yurucam tf-idf: 0.029496 word: 富士山"
# "target: koihika tf-idf: 0.000000 word: 富士山"
# "target: yurucam tf-idf: 0.000000 word: 大学生"
# "target: koihika tf-idf: 0.022360 word: 大学生"

# "target: yurucam tf-idf: 0.000000 word: 女の子"
# "target: koihika tf-idf: 0.000000 word: 女の子"
# "target: yurucam tf-idf: NaN      word: バトル"
# "target: koihika tf-idf: NaN      word: バトル"

By combining TF and IDF, how close a term is to the document's main topic is also scored appropriately.

Implementing BM25

BM25 is a refinement of TF-IDF: the TF part is replaced with a term that is normalized by the document's total word count (DL) and controlled by additional parameters (k and b).
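
Concretely, the per-term score computed below is, with f(t, d) the (damped) occurrence count, dl the document length, avgdl the average document length, and the common defaults k = 1.2 and b = 0.75:

score(t, d) = idf(t) * ((k + 1) * f(t, d)) / (f(t, d) + k * (1 - b + b * dl / avgdl))

A larger k lets repeated occurrences keep adding to the score for longer before saturating, and a larger b penalizes long documents more strongly.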

targets = {}
# Quoted from the description of ゆるキャン△ Vol. 1
targets["yurucam"] = [
  "富士山", "が", "見える", "湖畔", "で", "キャンプ", "を", "する", "女の子", "リン", "自転車", "に", "乗り", "富士山", "を", "見に", "きた", "女の子", "なでしこ", "二人", "で", "カップラーメン", "を", "食べて", "見た","景色", "は","読めば", "キャンプ", "に", "行き", "たく", "なる","行か","なくて","も","行った","気分","に", "なる", "そんな", "新感覚", "キャンプ","マンガ","の", "登場", "です"
]
# Quoted from the description of 恋は光 Vol. 1
targets["koihika"] = [
  "恋", "の", "光", "が", "視えて", "しまう", "大学生", "西条", "は", "恋", "を", "探求", "する", "女の子", "東雲", "に", "恋", "を", "した", "視える", "から", "こそ","切なくて", "苦しい", "今", "まで", "に", "ない", "ラブストーリー", "が", "始まる"
]

def calc_idf(targets, word)
  all_doc_count = targets.length
  match_doc_count = 0
  targets.each do |target|
    if target.include?(word)
      match_doc_count = match_doc_count + 1
    end
  end

  return Math.log(all_doc_count.to_f / match_doc_count.to_f)
end

def calc_bm25(target, word, avgdl, idf)
  k = 1.2
  b = 0.75
  dl = target.length

  # sqrt-damped occurrence count ("for Elasticsearch")
  f = Math.sqrt(target.count(word))

  # This is the one part of the formula that is hard to grasp intuitively
  # cf. References
  return idf * ((k + 1) * f) / (f + k * (1 - b + b * (dl / avgdl)))
end


words = ["キャンプ", "恋", "カップラーメン", "富士山", "大学生", "女の子", "バトル"]

words.each do |w|
  idf = calc_idf(targets.values, w)
  # average document length over the whole corpus
  avgdl = targets.values.map { |t| t.length }.sum.to_f / targets.length

  targets.sort.to_h.each do |name, target|
    bm25 = calc_bm25(target, w, avgdl, idf)
    pp sprintf("target: %s bm25: %f word: %s", name, bm25, w)
  end
end

# "target: koihika bm25: 0.000000 word: キャンプ"
# "target: yurucam bm25: 0.116190 word: キャンプ"
# "target: koihika bm25: 0.170051 word: 恋"
# "target: yurucam bm25: 0.000000 word: 恋"

# "target: koihika bm25: 0.000000 word: カップラーメン"
# "target: yurucam bm25: 0.069315 word: カップラーメン"
# "target: koihika bm25: 0.000000 word: 富士山"
# "target: yurucam bm25: 0.096214 word: 富士山"
# "target: koihika bm25: 0.103035 word: 大学生"
# "target: yurucam bm25: 0.000000 word: 大学生"

# "target: koihika bm25: 0.000000 word: 女の子"
# "target: yurucam bm25: 0.000000 word: 女の子"
# "target: koihika bm25: NaN word: バトル"
# "target: yurucam bm25: NaN word: バトル"

Raw occurrence counts influence the score less than with TF-IDF.

An advantage attributed to BM25 is that the score stays stable even when occurrence counts differ wildly, e.g. 1 vs. 1000.
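
As a rough illustration, here is a minimal sketch of just the TF component of the formula above, assuming dl = avgdl so that the length-normalization term drops out:

k = 1.2

# BM25's TF component when dl == avgdl (length term = 1)
bm25_tf = ->(freq) { ((k + 1) * freq) / (freq + k) }

[1, 10, 100, 1000].each do |freq|
  puts format("freq: %4d  bm25 tf component: %.3f", freq, bm25_tf.call(freq))
end

# freq:    1  bm25 tf component: 1.000
# freq:   10  bm25 tf component: 1.964
# freq:  100  bm25 tf component: 2.174
# freq: 1000  bm25 tf component: 2.197

The component saturates toward k + 1, so 1000 occurrences score only marginally higher than 10, whereas plain TF would keep growing linearly.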

2. Elasticsearch's actual scoring logic

There are small differences:

  • the BM25 (k + 1) factor shows up as boost = 2.2
  • the IDF formula is slightly different
  • the term and field-length counts come out slightly different (dl is approximate)

but apart from these you can see that the score is computed with essentially the same logic. The _explanation below comes from a search for 恋 with explain enabled:

{
  "hits" : {
    "total" : {
      "value" : 1,
      "relation" : "eq"
    },
    "max_score" : 0.28758648,
    "hits" : [
      {
        "_id" : "4",
        "_score" : 0.28758648,
        "_source" : {
          "analyzer" : "kuromoji",
          "description" : "「恋の光」が視えてしまう大学生・西条は、恋を探求する女の子・東雲に恋をした──。視えるからこそ切なくて苦しい。今までにないラブストーリーが始まる。"
        },
        "_explanation" : {
          "value" : 0.28758648,
          "description" : "weight(description:恋 in 4) [PerFieldSimilarity], result of:",
          "details" : [
            {
              "value" : 0.28758648,
              "description" : "score(freq=3.0), computed as boost * idf * tf from:",
              "details" : [
                {
                  "value" : 2.2,
                  "description" : "boost",
                  "details" : [ ]
                },
                {
                  "value" : 0.18232156,
                  "description" : "idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:",
                  "details" : [
                    {
                      "value" : 2,
                      "description" : "n, number of documents containing term",
                      "details" : [ ]
                    },
                    {
                      "value" : 2,
                      "description" : "N, total number of documents with field",
                      "details" : [ ]
                    }
                  ]
                },
                {
                  "value" : 0.7169812,
                  "description" : "tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:",
                  "details" : [
                    {
                      "value" : 3.0,
                      "description" : "freq, occurrences of term within document",
                      "details" : [ ]
                    },
                    {
                      "value" : 1.2,
                      "description" : "k1, term saturation parameter",
                      "details" : [ ]
                    },
                    {
                      "value" : 0.75,
                      "description" : "b, length normalization parameter",
                      "details" : [ ]
                    },
                    {
                      "value" : 56.0,
                      "description" : "dl, length of field (approximate)",
                      "details" : [ ]
                    },
                    {
                      "value" : 57.0,
                      "description" : "avgdl, average length of field",
                      "details" : [ ]
                    }
                  ]
                }
              ]
            }
          ]
        }
      }
    ]
  }
}
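
As a quick sanity check, plugging the numbers from the _explanation above into the boost * idf * tf formula it describes reproduces the reported score (the last digits differ slightly, presumably because Lucene computes scores in single precision):

boost          = 2.2    # k1 + 1
docs_with_term = 2.0    # n
total_docs     = 2.0    # N
freq           = 3.0    # occurrences of 恋 in the document
k1             = 1.2
b              = 0.75
dl             = 56.0
avgdl          = 57.0

idf = Math.log(1 + (total_docs - docs_with_term + 0.5) / (docs_with_term + 0.5))
tf  = freq / (freq + k1 * (1 - b + b * dl / avgdl))

pp boost * idf * tf
# => 0.2875864...   (Elasticsearch reports 0.28758648)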

References