FM-index と同様に、ほぼ最適化をしていない素朴な実装を作ってみました。
元の論文の実装は空間効率まで考慮して書かれていてすごい! ……のですが、まずはそれ以外の部分の仕組みを理解したかったので、以下のコードではそこらへんも無視しています。
(※ 2016-03-17 に書いた記事のクロスポストです)
require "minitest/autorun"
SENTINEL = "$"
TYPE_L = "L"
TYPE_S = "S"
NAME_CHARS = [
"a","b","c","d","e","f","g","h","i","j",
"k","l","m","n","o","p","q","r","s","t",
"u","v","w","x","y","z"
]
def _assert(exp)
raise "must not happen" if exp == false
end
# from <= i <= to
def _each_up(from, to)
( from.upto(to) ).each {|i| yield(i) }
end
# from >= i >= to
def _each_down(from, to)
( from.downto(to) ).each {|i| yield(i) }
end
class Buckets
def initialize(s_cs)
@ary = []
@freq_map = make_freq_map(s_cs)
@sorted_uniq_chars = @freq_map.keys().sort()
clear(s_cs.length())
end
def clear(length)
_each_up(0, length - 1) {|i|
@ary[i] = nil
}
end
def make_freq_map(s_cs)
uniq_cs = s_cs.uniq()
# 初期化
freq_map = {}
uniq_cs.each {|c| freq_map[c] = 0 }
s_cs.each {|c| freq_map[c] += 1 }
return freq_map
end
def first_pos(c)
i = 0
found = false
@sorted_uniq_chars.each {|iter_c|
if (iter_c == c)
found = true
break
end
i += num_bucket_elements(iter_c)
}
return i
end
def num_bucket_elements(c)
return @freq_map[c]
end
def add_l(c, si)
# バケツの先頭・末尾
bia = first_pos(c)
biz = bia + num_bucket_elements(c) - 1
i = nil
# 線形走査。遅い。
_each_up(bia, biz) {|bi|
if @ary[bi] != nil
next
end
i = bi
break
}
_assert( i != nil )
@ary[i] = si
end
def add_s(c, si)
# バケツの先頭・末尾
bia = first_pos(c)
biz = bia + num_bucket_elements(c) - 1
i = nil
# 線形走査。遅い。
_each_down(biz, bia) {|bi|
if @ary[bi] != nil
next
end
i = bi
break
}
_assert( i != nil )
@ary[i] = si
end
def num_chars()
return @ary.length()
end
def get(i)
return @ary[i]
end
def set(i, value)
@ary[i] = value
end
def to_array()
return @ary
end
end
def make_types(s_cs)
types = []
types[s_cs.length() - 1] = TYPE_S # sentinel は S
prev_type = TYPE_S
prev_c = SENTINEL
_each_down(s_cs.length() - 2, 0) {|si|
c = s_cs[si]
if (c < prev_c)
type = TYPE_S
elsif (c > prev_c)
type = TYPE_L
else
type = prev_type
end
types[si] = type
prev_c = c
prev_type = type
}
return types
end
def extract_lms_positions(types)
sis = []
prev_type = TYPE_S # sentinel
_each_down(types.length() - 2, 0) {|si|
type = types[si]
if (type == TYPE_L && prev_type == TYPE_S)
sis.unshift(si + 1)
end
prev_type = type
}
return sis
end
def add_lms_to_bkts(bkts, s_cs, lms_sis)
lms_sis.each {|lms_si|
c = s_cs[lms_si]
bkts.add_s(c, lms_si)
}
return bkts
end
def induce_l(bkts, s_cs, types)
# 上から
_each_up(0, bkts.num_chars() - 1) {|bi|
si = bkts.get(bi)
next if si == nil
next if si == 0
next if types[si - 1] != TYPE_L
prev_c = s_cs[si - 1]
bkts.add_l(prev_c, si - 1)
}
return bkts
end
def induce_s(bkts, s_cs, types)
# 下から
_each_down(bkts.num_chars() - 1, 0) {|bi|
si = bkts.get(bi)
next if si == nil
next if si == 0
next if types[si - 1] != TYPE_S
prev_c = s_cs[si - 1]
bkts.add_s(prev_c, si - 1)
}
return bkts
end
def is_lms(lms_sis, si)
return lms_sis.include?(si)
end
def induced_sort(bkts, s_cs, types, lms_sis)
bkts = induce_l(bkts, s_cs, types)
# sentinel 以外の LMS を除去
_each_up(1, bkts.num_chars() - 1) {|bi|
si = bkts.get(bi)
bkts.set(bi, nil) if is_lms(lms_sis, si)
}
bkts = induce_s(bkts, s_cs, types)
return bkts
end
def is_same_substring(s_cs, ai, bi, lms_sis)
i = 0
is_same = true
while true
if s_cs[ai + i] != s_cs[bi + i]
is_same = false
break
end
if i >= 2
# 元論文ではタイプ(L/S)による判別も行なっている
a_is_lms = is_lms(lms_sis, ai + i)
b_is_lms = is_lms(lms_sis, bi + i)
if (a_is_lms && b_is_lms)
break
elsif (! a_is_lms && b_is_lms)
is_same = false
break
elsif (a_is_lms && ! b_is_lms)
is_same = false
break
else
# both are not LMS
end
end
i += 1
_assert(i < s_cs.length())
end
return is_same
end
def get_name(i)
name = NAME_CHARS[i]
if (name == nil)
raise "names (LMS-substring) is too many"
end
return name
end
def to_names(s_cs, lms_sis, sorted_lms_sis_temp)
is_unique = true
# name index
ni = 0
names = []
# 1個目
names.unshift(get_name(ni))
ni += 1
# 2個目以降
_each_up(0, sorted_lms_sis_temp.length() - 2) {|ai|
sia = sorted_lms_sis_temp[ai]
sib = sorted_lms_sis_temp[ai + 1]
if (is_same_substring(s_cs, sia, sib, lms_sis))
is_unique = false
else
ni += 1
end
names.unshift(get_name(ni))
}
return [names, is_unique]
end
def sa_is(s_cs)
bkts = Buckets.new(s_cs)
types = make_types(s_cs)
lms_sis = extract_lms_positions(types)
# --------------------------------
# induced sort 1回目
# LMS-substring をソートするのが目的
bkts = add_lms_to_bkts(bkts, s_cs, lms_sis)
bkts = induced_sort(bkts, s_cs, types, lms_sis)
# この時点で LMS-substring がソートされた状態になる
# (ただし、重複した LMS-substring 同士の順序は未確定)
# --------------------------------
# LMS-substring のソート
# LMS だけを抜き出す
sorted_lms_sis_temp = []
_each_up(0, bkts.num_chars() - 1) {|bi|
si = bkts.get(bi)
if (is_lms(lms_sis, si))
sorted_lms_sis_temp << si
end
}
names, is_unique = to_names(s_cs, lms_sis, sorted_lms_sis_temp)
sorted_lms_sis = nil
if (is_unique)
sorted_lms_sis = sorted_lms_sis_temp
else
ret = sa_is(names)
sorted_lms_sis = []
ret.each {|i|
sorted_lms_sis.unshift(lms_sis[i])
}
end
# --------------------------------
# induced sort 2回目
# 1回目のソートは LMS-substring のソート結果を得るのが目的だったので
# 一旦空にして良い。
bkts.clear(s_cs.length())
bkts = add_lms_to_bkts(bkts, s_cs, sorted_lms_sis)
bkts = induced_sort(bkts, s_cs, types, sorted_lms_sis)
return bkts.to_array()
end
class SaIsTest < Minitest::Test
def test_make_lms_sis_1
# 0 1
assert_equal([1], extract_lms_positions(["L","S"]))
end
def test_make_lms_sis_2
# 0 1 2
assert_equal([2], extract_lms_positions(["S","L","S"]))
end
def test_make_lms_sis_3
# 0 1 2 3
assert_equal [2], extract_lms_positions(["S","L","S","S"])
end
def test_make_lms_sis_4
# 0 1 2 3 4 5
assert_equal([2, 5], extract_lms_positions(["S","L","S","L","L","S"]))
end
def test_is_same_substring_1
# 0 1 2 3 4 5 6 7
# L S L S L S L S
t_cs = ["b","a","b","a","b","a","b","$"]
lms_sis = [1,3,5,7]
assert_equal true, is_same_substring(t_cs, 1, 3, lms_sis)
end
def test_is_same_substring_2
# 0 1 2 3 4 5 6 7
# L S L S L S L S
s_cs = ["b","a","c","a","b","a","b","$"]
lms_sis = [1,3,5,7]
assert_equal false, is_same_substring(s_cs, 1, 3, lms_sis)
end
def test_sa_is_bbaaddaaddaaccaa
# 0 1
# 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6
# L L S S L L S S L L S S L L L L S
s_cs = ["b","b","a","a","d","d","a","a","d","d","a","a","c","c","a","a", SENTINEL]
assert_equal(
[16 ,15 ,14 ,10 ,6 ,2 ,11 ,7 ,3 ,1 ,0 ,13 ,12 ,9 ,5 ,8 ,4],
sa_is(s_cs)
)
end
def test_sa_is_eaefaegaefaag
# 0 1
# 0 1 2 3 4 5 6 7 8 9 0 1 2 3
# L S S L S S L S S L S S L S
s_cs = ["e","a","e","f","a","e","g","a","e","f","a","a","g", SENTINEL]
assert_equal(
[13,10,7,1,4,11,0,8,2,5,9,3,12,6],
sa_is(s_cs)
)
end
def test_sa_is_mississippi
# 0 1
# 0 1 2 3 4 5 6 7 8 9 0 1
# L S L L S L L S L L L S
s_cs = ["m","i","s","s","i","s","s","i","p","p","i", SENTINEL]
assert_equal(
[11,10,7,4,1,0,9,8,6,3,5,2],
sa_is(s_cs)
)
end
def test_sa_is_abracadabra
# 0 1
# 0 1 2 3 4 5 6 7 8 9 0 1
# S S L S L S L S S L L S
s_cs = ["a","b","r","a","c","a","d","a","b","r","a", SENTINEL]
assert_equal(
[11,10,7,0,3,5,8,1,4,6,9,2],
sa_is(s_cs)
)
end
end
(追記 2021-02-14) Ruby 3.0.0 向けに minitest まわりを微修正しました。
Crieitは誰でも投稿できるサービスです。 是非記事の投稿をお願いします。どんな軽い内容でも投稿できます。
また、「こんな記事が読みたいけど見つからない!」という方は是非記事投稿リクエストボードへ!
こじんまりと作業ログやメモ、進捗を書き残しておきたい方はボード機能をご利用ください。
ボードとは?
コメント