「データ構造」の記事 - Crieit

Rubyで素朴なSA-IS（suffix array induced sorting）を書いてみた

2021-02-14T18:24:50+09:00

FM-index と同様に、ほぼ最適化をしていない素朴な実装を作ってみました。

元の論文の実装は空間効率まで考慮して書かれていてすごい！ ……のですが、まずはそれ以外の部分の仕組みを理解したかったので、以下のコードではそこらへんも無視しています。

（※ 2016-03-17 に書いた記事のクロスポストです）

コード

require "minitest/autorun"

# Marker character expected as the last element of the input array.
SENTINEL = "$"

# Suffix type tags: L-type and S-type.
TYPE_L = "L"
TYPE_S = "S"

# One-character names handed out to LMS-substrings ("a" through "z").
NAME_CHARS = ("a".."z").to_a

# Internal invariant check: raises "must not happen" when +exp+ equals false.
# Any other value (including nil) passes.
def _assert(exp)
  return unless exp == false

  raise "must not happen"
end

# Yields every integer i with from <= i <= to, in ascending order.
# Nothing is yielded when from > to.
def _each_up(from, to)
  from.upto(to) { |idx| yield(idx) }
end

# Yields every integer i with from >= i >= to, in descending order.
# Nothing is yielded when from < to.
def _each_down(from, to)
  from.downto(to) { |idx| yield(idx) }
end


# Work array for induced sorting, divided into per-character buckets.
#
# The backing array has one slot per input character. Slots are grouped into
# buckets, one per distinct character, laid out in sorted character order;
# each bucket is as wide as the number of occurrences of its character.
# A slot holding nil is free.
class Buckets

  # s_cs: array of characters (elements must be sortable and usable as
  # Hash keys).
  def initialize(s_cs)
    @ary = []
    @freq_map = make_freq_map(s_cs)
    @sorted_uniq_chars = @freq_map.keys.sort
    # Bucket start offsets never change, so they are computed once here
    # instead of being re-summed on every first_pos call.
    @first_pos_map = make_first_pos_map

    clear(s_cs.length)
  end

  # Resets slots 0...length to nil (growing the array when necessary).
  def clear(length)
    length.times { |i| @ary[i] = nil }
  end

  # Returns a Hash mapping each character of s_cs to its occurrence count.
  def make_freq_map(s_cs)
    s_cs.each_with_object({}) do |c, freq|
      freq[c] = freq.fetch(c, 0) + 1
    end
  end

  # Index of the first slot of c's bucket. For a character that does not
  # occur in the input, the total number of characters is returned.
  def first_pos(c)
    @first_pos_map.fetch(c, @total)
  end

  # Width of c's bucket (nil for a character that does not occur).
  def num_bucket_elements(c)
    @freq_map[c]
  end

  # Stores si in the first free slot of c's bucket, searching from the head.
  # Raises "must not happen" when the bucket is full.
  def add_l(c, si)
    bia = first_pos(c)
    biz = bia + num_bucket_elements(c) - 1
    place(bia.upto(biz), si)
  end

  # Stores si in the first free slot of c's bucket, searching from the tail.
  # Raises "must not happen" when the bucket is full.
  def add_s(c, si)
    bia = first_pos(c)
    biz = bia + num_bucket_elements(c) - 1
    place(biz.downto(bia), si)
  end

  # Number of slots in the backing array.
  def num_chars
    @ary.length
  end

  def get(i)
    @ary[i]
  end

  def set(i, value)
    @ary[i] = value
  end

  # Returns the backing array itself (not a copy).
  def to_array
    @ary
  end

  private

  # Builds the character => bucket start index table and records the total
  # slot count used as the answer for unknown characters.
  def make_first_pos_map
    offset = 0
    table = {}
    @sorted_uniq_chars.each do |c|
      table[c] = offset
      offset += @freq_map[c]
    end
    @total = offset
    table
  end

  # Writes si into the first free slot among candidates (a linear scan, so
  # still slow for wide buckets).
  def place(candidates, si)
    i = candidates.find { |bi| @ary[bi].nil? }
    raise "must not happen" if i.nil?

    @ary[i] = si
  end
end


# Classifies every position of s_cs as TYPE_L or TYPE_S by scanning from
# right to left. The last position is always TYPE_S; a position whose
# character equals its right neighbour inherits the neighbour's type.
# The right neighbour of the second-to-last position is taken to be SENTINEL.
def make_types(s_cs)
  last = s_cs.length - 1
  types = []
  types[last] = TYPE_S # the sentinel is S

  right_c = SENTINEL
  right_type = TYPE_S
  (last - 1).downto(0) do |si|
    c = s_cs[si]
    current =
      if c > right_c
        TYPE_L
      elsif c < right_c
        TYPE_S
      else
        right_type
      end

    types[si] = current
    right_c = c
    right_type = current
  end

  types
end

# Returns, in ascending order, every position si + 1 such that types[si] is
# TYPE_L and its right neighbour is TYPE_S (an LMS position). The last
# position is always treated as TYPE_S, whatever types holds there.
def extract_lms_positions(types)
  right_type = TYPE_S # sentinel
  found = []

  (types.length - 2).downto(0) do |si|
    here = types[si]
    found << (si + 1) if here == TYPE_L && right_type == TYPE_S
    right_type = here
  end

  found.reverse
end

# Calls bkts.add_s(character, position) for every position in lms_sis, in
# the given order, and returns bkts.
def add_lms_to_bkts(bkts, s_cs, lms_sis)
  lms_sis.each { |pos| bkts.add_s(s_cs[pos], pos) }
  bkts
end

# Scans the buckets from the first slot to the last. For every stored
# position si other than 0 whose left neighbour si - 1 is TYPE_L, the
# neighbour is added to its bucket via add_l. Returns bkts.
def induce_l(bkts, s_cs, types)
  last_bi = bkts.num_chars - 1

  0.upto(last_bi) do |bi|
    si = bkts.get(bi)
    if !si.nil? && si != 0 && types[si - 1] == TYPE_L
      left = si - 1
      bkts.add_l(s_cs[left], left)
    end
  end

  bkts
end

# Scans the buckets from the last slot to the first. For every stored
# position si other than 0 whose left neighbour si - 1 is TYPE_S, the
# neighbour is added to its bucket via add_s. Returns bkts.
def induce_s(bkts, s_cs, types)
  last_bi = bkts.num_chars - 1

  last_bi.downto(0) do |bi|
    si = bkts.get(bi)
    if !si.nil? && si != 0 && types[si - 1] == TYPE_S
      left = si - 1
      bkts.add_s(s_cs[left], left)
    end
  end

  bkts
end

# True when si is one of the positions listed in lms_sis (linear search).
def is_lms(lms_sis, si)
  lms_sis.any? { |pos| pos == si }
end

# One round of induced sorting: induce_l, then clear every slot except
# slot 0 that holds a position listed in lms_sis, then induce_s.
# Returns the buckets object returned by induce_s.
def induced_sort(bkts, s_cs, types, lms_sis)
  bkts = induce_l(bkts, s_cs, types)

  # Drop the LMS entries; slot 0 (the sentinel) is never touched.
  1.upto(bkts.num_chars - 1) do |bi|
    next unless is_lms(lms_sis, bkts.get(bi))

    bkts.set(bi, nil)
  end

  induce_s(bkts, s_cs, types)
end

# Compares the substrings of s_cs starting at ai and bi character by
# character. Returns false at the first differing character. From offset 2
# onwards the comparison ends as soon as either side reaches a position in
# lms_sis: true when both reach one at the same offset, false when only one
# does. (The original paper also compares the L/S types; this does not.)
def is_same_substring(s_cs, ai, bi, lms_sis)
  offset = 0

  loop do
    return false unless s_cs[ai + offset] == s_cs[bi + offset]

    if offset >= 2
      a_at_lms = is_lms(lms_sis, ai + offset)
      b_at_lms = is_lms(lms_sis, bi + offset)
      return true if a_at_lms && b_at_lms
      return false if a_at_lms || b_at_lms
    end

    offset += 1
    _assert(offset < s_cs.length)
  end
end

# Returns the i-th entry of NAME_CHARS; raises when there is no such entry.
def get_name(i)
  NAME_CHARS[i] || raise("names (LMS-substring) is too many")
end

# Builds the reduced string used for the recursive step.
#
# s_cs                - the input characters
# lms_sis             - LMS positions in text (ascending) order
# sorted_lms_sis_temp - the same positions ordered by LMS-substring
#
# Returns [names, is_unique]. names[k] is the name of the LMS-substring that
# starts at lms_sis[k], so names is laid out in text order as the reduced
# string requires. Equal LMS-substrings share a name, and names follow the
# sorted order of the substrings, starting from get_name(0). is_unique is
# false when at least two adjacent substrings in the sorted order are equal.
#
# The previous version emitted names in reverse sorted order, which equals
# text order only by coincidence, and never handed out the second name.
def to_names(s_cs, lms_sis, sorted_lms_sis_temp)
  is_unique = true
  ni = 0 # name index

  # position => name, filled in sorted order
  name_of = { sorted_lms_sis_temp[0] => get_name(ni) }

  sorted_lms_sis_temp.each_cons(2) do |sia, sib|
    if is_same_substring(s_cs, sia, sib, lms_sis)
      is_unique = false
    else
      ni += 1
    end
    name_of[sib] = get_name(ni)
  end

  # Rearrange into text order; fetch surfaces a position that was never sorted.
  names = lms_sis.map { |si| name_of.fetch(si) }

  [names, is_unique]
end

# Builds the suffix array of s_cs with a plain (unoptimized) SA-IS.
#
# s_cs is an array of characters; the tests pass SENTINEL as its last element.
# Returns the bucket array: the suffix start positions in sorted order.
#
# Steps:
# 1. A first induced sort, seeded with the LMS positions in text order,
#    sorts the LMS-substrings (equal substrings stay in arbitrary order).
# 2. The substrings are named. If some names repeat, the reduced string is
#    sorted recursively to settle the order of the LMS suffixes.
# 3. A second induced sort, seeded with the sorted LMS suffixes, yields the
#    final order.
def sa_is(s_cs)
  bkts = Buckets.new(s_cs)
  types = make_types(s_cs)
  lms_sis = extract_lms_positions(types)

  # --------------------------------
  # First induced sort: sorts the LMS-substrings.

  bkts = add_lms_to_bkts(bkts, s_cs, lms_sis)
  bkts = induced_sort(bkts, s_cs, types, lms_sis)

  # --------------------------------
  # Order of the LMS suffixes

  # Pick the LMS positions out of the buckets (ascending substring order).
  sorted_lms_sis_temp = []
  bkts.num_chars.times do |bi|
    si = bkts.get(bi)
    sorted_lms_sis_temp << si if is_lms(lms_sis, si)
  end

  names, is_unique = to_names(s_cs, lms_sis, sorted_lms_sis_temp)

  # sorted_lms_sis must be in DESCENDING suffix order: add_s fills each
  # bucket from its tail, so the largest suffix has to be inserted first.
  if is_unique
    # All substrings differ, so the first pass already gave the final order.
    # It is ascending and has to be reversed; passing it as is left LMS
    # suffixes that share a bucket in the wrong relative order.
    sorted_lms_sis = sorted_lms_sis_temp.reverse
  else
    ret = sa_is(names)
    # ret lists indices into lms_sis in ascending suffix order.
    sorted_lms_sis = ret.reverse.map { |i| lms_sis[i] }
  end

  # --------------------------------
  # Second induced sort

  # The first pass only served to sort the LMS-substrings, so the buckets
  # can be emptied before reseeding.
  bkts.clear(s_cs.length)

  bkts = add_lms_to_bkts(bkts, s_cs, sorted_lms_sis)
  bkts = induced_sort(bkts, s_cs, types, sorted_lms_sis)

  bkts.to_array
end


# Unit tests for the SA-IS helpers and for sa_is itself.
class SaIsTest < Minitest::Test

  # positions:                                     0 1
  def test_make_lms_sis_1
    assert_equal([1], extract_lms_positions(%w[L S]))
  end

  # positions:                                     0 1 2
  def test_make_lms_sis_2
    assert_equal([2], extract_lms_positions(%w[S L S]))
  end

  # positions:                                     0 1 2 3
  def test_make_lms_sis_3
    assert_equal([2], extract_lms_positions(%w[S L S S]))
  end

  # positions:                                        0 1 2 3 4 5
  def test_make_lms_sis_4
    assert_equal([2, 5], extract_lms_positions(%w[S L S L L S]))
  end


  def test_is_same_substring_1
    # index: 0 1 2 3 4 5 6 7
    # type:  L S L S L S L S
    t_cs = "bababab".chars + ["$"]
    lms_sis = [1, 3, 5, 7]
    assert_equal(true, is_same_substring(t_cs, 1, 3, lms_sis))
  end

  def test_is_same_substring_2
    # index: 0 1 2 3 4 5 6 7
    # type:  L S L S L S L S
    s_cs = "bacabab".chars + ["$"]
    lms_sis = [1, 3, 5, 7]
    assert_equal(false, is_same_substring(s_cs, 1, 3, lms_sis))
  end


  def test_sa_is_bbaaddaaddaaccaa
    # index: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
    # type:  L L S S L L S S L L S  S  L  L  L  L  S
    s_cs = "bbaaddaaddaaccaa".chars + [SENTINEL]
    expected = [16, 15, 14, 10, 6, 2, 11, 7, 3, 1, 0, 13, 12, 9, 5, 8, 4]

    assert_equal(expected, sa_is(s_cs))
  end

  def test_sa_is_eaefaegaefaag
    # index: 0 1 2 3 4 5 6 7 8 9 10 11 12 13
    # type:  L S S L S S L S S L S  S  L  S
    s_cs = "eaefaegaefaag".chars + [SENTINEL]
    expected = [13, 10, 7, 1, 4, 11, 0, 8, 2, 5, 9, 3, 12, 6]

    assert_equal(expected, sa_is(s_cs))
  end

  def test_sa_is_mississippi
    # index: 0 1 2 3 4 5 6 7 8 9 10 11
    # type:  L S L L S L L S L L L  S
    s_cs = "mississippi".chars + [SENTINEL]
    expected = [11, 10, 7, 4, 1, 0, 9, 8, 6, 3, 5, 2]

    assert_equal(expected, sa_is(s_cs))
  end

  def test_sa_is_abracadabra
    # index: 0 1 2 3 4 5 6 7 8 9 10 11
    # type:  S S L S L S L S S L L  S
    s_cs = "abracadabra".chars + [SENTINEL]
    expected = [11, 10, 7, 0, 3, 5, 8, 1, 4, 6, 9, 2]

    assert_equal(expected, sa_is(s_cs))
  end

end

（追記 2021-02-14） Ruby 3.0.0 向けに minitest まわりを微修正しました。

参考

2018-01-30 SA-IS 法のメモ - まめめも
- Ruby による実装

Rubyで素朴なFM-indexを書いてみた

2021-02-08T18:51:49+09:00

BWT、検索処理の最適化・高速化は行なっていません（SA-IS、ウェーブレット行列などは使っていません）。 BWT から検索まで全体の流れが見渡せる最小限の実装にしました。せっかくなので Ruby に馴染みのない方が見ても読みやすいと思われる書き方にしています（returnやメソッド呼び出しの括弧を省略しない、など）。

（※ 2016-03-03 に書いた記事のクロスポストです）

参考にしたもの

Teaching Materials - ここに置いてある「Burrows-Wheeler Transform and FM Index」というタイトルの PDF（ジョンズ・ホプキンス大学 Ben Langmead さんの講義資料）

最初の取っ掛かりとして分かりやすかったのがこれ。要点を押さえた簡潔な図と Python コードを眺めているだけで、だいぶ分かった気になれます。

erukiti/cerebrums: 文章・情報共有ソフト

もっと具体的なところについては実装を見た方が早いということで、「ハクビシンにもわかる全文検索」の erukiti さんの実装を参考にさせてもらいました。CoffeeScript 製。

コード

- range
  - (a..b)  => [a, b] b is included
  - (a...b) => [a, b) b is not included
  - a.downto(b) => [a, a-1, ... b+1, b]
- method { |x| ... } => in JavaScript: method((x)=>{ ... })

require "minitest/autorun"

# Marker character that to_bwm appends to the text before building rotations.
SENTINEL = '$'

# Returns the sorted list of all rotations of t + SENTINEL (the
# Burrows-Wheeler matrix), one string per row.
def to_bwm(t)
  width = t.length + 1
  doubled = t + SENTINEL + t
  rotations = Array.new(width) { |offset| doubled[offset, width] }
  rotations.sort
end

# Burrows-Wheeler transform: the last character of every row of the matrix
# returned by to_bwm, concatenated in row order.
def bwt(t)
  to_bwm(t).each_with_object(+"") do |row, last_column|
    last_column << row[-1]
  end
end

# Number of elements of cs that compare smaller than c.
def rank_less_than(cs, c)
  cs.select { |ch| ch < c }.length
end

# Number of occurrences of c among the first i elements of cs
# (indices 0 up to, but not including, i).
def rank(cs, c, i)
  occurrences = 0
  i.times { |pos| occurrences += 1 if cs[pos] == c }
  occurrences
end

# LF mapping: the number of elements of cs smaller than c, plus the number
# of occurrences of c among the first i elements of cs.
def map_lf(cs, c, i)
  smaller = rank_less_than(cs, c)
  preceding = rank(cs, c, i)
  smaller + preceding
end

# Reconstructs the text that precedes character c by repeatedly applying
# the LF mapping until SENTINEL is reached.
#
# bwt_t - the Burrows-Wheeler transformed string
# c     - the character to start from
# s     - the row to start from
# e     - end of the row range; kept for interface compatibility. The
#         previous version recomputed it on every step without reading it.
#
# Returns the collected characters in text order.
# Raises ArgumentError when SENTINEL is not reached within bwt_t.length
# steps (the previous version looped forever on such input).
def backward_chars(bwt_t, c, s, e)
  bwt_cs = bwt_t.split("") # to array of chars

  cb = c # T(i)
  result = +""

  # A walk over a well-formed transform visits every row at most once.
  bwt_cs.length.times do
    s = map_lf(bwt_cs, cb, s)
    ca = bwt_t[s] # T(i-1)
    return result.reverse if ca == SENTINEL

    result << ca
    cb = ca
  end

  raise ArgumentError, "sentinel not reachable: malformed BWT string"
end

# Restores the original text from its Burrows-Wheeler transform.
def reverse(bwt_t)
  # The sentinel is always at F[0] and nowhere else in the F column, so the
  # walk starts from the row range [0, 1).
  first_row = 0
  row_end = 1
  backward_chars(bwt_t, SENTINEL, first_row, row_end)
end

# Backward search of q in the text whose transform is bwt_t.
# Starting from the full row range [0, bwt_t.length), both ends are mapped
# with map_lf for each character of q, from the last one to the first.
# Returns [s, e] (e exclusive), or nil as soon as the range becomes empty.
def search_internal(bwt_t, q)
  bwt_cs = bwt_t.split("") # to array of chars
  lo = 0
  hi = bwt_t.length

  (0...q.length).reverse_each do |i|
    lo = map_lf(bwt_cs, q[i], lo)
    hi = map_lf(bwt_cs, q[i], hi)
    return nil if lo >= hi
  end

  [lo, hi]
end

# Number of occurrences of q: the width of the row range reported by
# search_internal, or 0 when it reports no range.
def search(bwt_t, q)
  first, last = search_internal(bwt_t, q)
  return 0 if first.nil?

  last - first
end


# Tests for the naive FM-index: the transform, backward search and inversion.
class FmIndexTest < Minitest::Test

  def setup
    t = "abaaba"
    @bwt_t = bwt(t)
  end

  def test_bwt
    assert_equal("abba$aa", @bwt_t)
  end

  # Searching for a single character
  def test_one_char
    s, e = search_internal(@bwt_t, "a")
    assert_equal(1, s)
    assert_equal(5, e)

    # number of occurrences
    num_hits = search(@bwt_t, "a")
    assert_equal(4, num_hits)
  end

  # Pattern that occurs twice
  def test_2_hits
    s, e = search_internal(@bwt_t, "aba")
    assert_equal(3, s)
    assert_equal(5, e)

    assert_equal(2, search(@bwt_t, "aba"))
  end

  # Pattern that occurs once
  def test_one_hit
    s, e = search_internal(@bwt_t, "aaba")
    assert_equal(2, s)
    assert_equal(3, e)

    assert_equal(1, search(@bwt_t, "aaba"))
  end

  # Combination of characters that does not occur.
  # (This method lacked the test_ prefix, so Minitest never ran it.)
  def test_non_existent_combination
    s, e = search_internal(@bwt_t, "baba")
    assert_nil(s)
    assert_nil(e)

    assert_equal(0, search(@bwt_t, "baba"))
  end

  # Character that does not occur.
  # (This method lacked the test_ prefix, so Minitest never ran it.)
  def test_non_existent_char
    s, e = search_internal(@bwt_t, "x")
    assert_nil(s)
    assert_nil(e)

    assert_equal(0, search(@bwt_t, "x"))
  end

  def test_reverse
    bwt_t = bwt("mississippi")
    assert_equal("ipssm$pissii", bwt_t)
    assert_equal("mississippi", reverse(bwt_t))
  end

end

（追記 2021-02-08） Ruby 3.0.0 向けに minitest まわりを微修正しました。

Pythonで学ぶデータ構造入門 List編

2019-06-11T11:13:41+09:00

TL;DR

データ構造の基本であるList(LinkedList)やHashMap、Queue、Dequeを自分で実装して理解を深めようという趣旨でやっていきます。

まずはLinkedListを作ってみる

LinkedListとは、リストの各々の要素に次の要素への参照をつけておくことで一連のデータを表現できるデータ構造です。

このように、それぞれの要素が次への参照とデータを持っています。
これをPythonで実装してみます。
まずはそれぞれの要素のクラスです。

```python:要素
class Element:
    """A single node of the LinkedList.

    Element(data, next)

    data is the value this node represents.
    next is a reference to the following node (None when there is none).
    """

    def __init__(self, data, next=None):
        self.data = data
        self.next = next


```

次にリスト本体を実装してみます。

```python:LinkedList
class LinkedList:
    """Minimal singly linked list built from Element nodes.

    Only a reference to the first node is kept, so every operation that
    needs a position walks the chain from the front.
    """

    def __init__(self):
        self.first = None

    @property
    def is_empty(self):
        """True when the list holds no element."""
        return self.first is None

    def append(self, data):
        """Add data after the current last element."""
        if self.is_empty:
            self.first = Element(data)
            return
        node = self.first
        while node.next is not None:
            node = node.next
        node.next = Element(data)

    def pop(self):
        """Remove and return the last element.

        Raises ValueError when the list is empty.
        """
        if self.is_empty:
            raise ValueError

        # Single element: the list becomes empty.
        if self.first.next is None:
            last = self.first.data
            self.first = None
            return last

        # Stop at the node BEFORE the last one so it can be unlinked.
        prev = self.first
        while prev.next.next is not None:
            prev = prev.next

        last = prev.next.data
        prev.next = None
        return last

    def remove(self, idx):
        """Remove and return the element at position idx.

        Raises IndexError when idx is outside 0 <= idx < len(self); this
        also covers the empty list.
        """
        size = len(self)
        if idx >= size or idx < 0:
            raise IndexError(idx)

        # Removing the head only moves self.first.
        if idx == 0:
            removed = self.first
            self.first = removed.next
            return removed.data

        prev = self._node_at(idx - 1)
        removed = prev.next
        prev.next = removed.next
        return removed.data

    def insert(self, idx, data):
        """Insert data so that it ends up at position idx.

        idx == len(self) appends. Raises IndexError when idx is outside
        0 <= idx <= len(self).
        """
        size = len(self)
        if idx > size or idx < 0:
            raise IndexError(idx)

        # Inserting at the head only moves self.first.
        if idx == 0:
            self.first = Element(data, self.first)
            return

        # Link the new node right after the node at idx - 1.
        prev = self._node_at(idx - 1)
        prev.next = Element(data, prev.next)

    def __iter__(self):
        node = self.first
        while node is not None:
            yield node.data
            node = node.next

    def __len__(self):
        count = 0
        node = self.first
        while node is not None:
            count += 1
            node = node.next
        return count

    def __getitem__(self, idx):
        if idx >= len(self) or idx < 0:
            raise IndexError(idx)
        return self._node_at(idx).data

    def __setitem__(self, idx, val):
        if idx >= len(self) or idx < 0:
            raise IndexError(idx)
        self._node_at(idx).data = val

    def _node_at(self, idx):
        """Return the node at position idx; the caller validates idx."""
        node = self.first
        for _ in range(idx):
            node = node.next
        return node

```

こんな感じですね。
これが一番単純なLinkedListの実装です。
ですけど、結構非効率的ですね。

例えば、一番後ろのデータを取得するために最初から全部辿っています。
どのデータにアクセスするにも毎回先頭から順に辿るのでは、後半のデータにアクセスする際に非効率になってしまいます。
これを解決するのが双方向リストというものです。

(次回があれば)これを双方向リストに改造してみたいと思います。
今日のところはここまでにしましょう。それがいい。