tag:crieit.net,2005:https://crieit.net/tags/%E3%83%87%E3%83%BC%E3%82%BF%E6%A7%8B%E9%80%A0/feed
「データ構造」の記事 - Crieit
Crieitでタグ「データ構造」に投稿された最近の記事
2021-02-14T18:50:36+09:00
https://crieit.net/tags/%E3%83%87%E3%83%BC%E3%82%BF%E6%A7%8B%E9%80%A0/feed
tag:crieit.net,2005:PublicArticle/16685
2021-02-14T18:24:50+09:00
2021-02-14T18:50:36+09:00
https://crieit.net/posts/Ruby-SA-IS-suffix-array-induced-sorting
Rubyで素朴なSA-IS(suffix array induced sorting)を書いてみた
<p><a target="_blank" rel="nofollow noopener" href="https://memo88.hatenablog.com/entry/20160303/1457007566">FM-index</a> と同様に、ほぼ最適化をしていない素朴な実装を作ってみました。</p>
<p>元の論文の実装は空間効率まで考慮して書かれていてすごい! ……のですが、まずはそれ以外の部分の仕組みを理解したかったので、以下のコードではそこらへんも無視しています。</p>
<p>(※ <a target="_blank" rel="nofollow noopener" href="https://memo88.hatenablog.com/entry/20160317/1458224137">2016-03-17 に書いた記事</a>のクロスポストです)</p>
<h1 id="コード"><a href="#%E3%82%B3%E3%83%BC%E3%83%89">コード</a></h1>
<pre><code class="ruby">require "minitest/autorun"
# Terminator character; the tests always pass it as the last element of the text.
SENTINEL = "$"

# Suffix type markers: "L" when a character is larger than its successor,
# "S" when it is smaller (ties inherit the successor's type).
TYPE_L = "L"
TYPE_S = "S"

# Pool of one-letter names handed out to LMS substrings ("a" through "z").
NAME_CHARS = ("a".."z").to_a
# Internal sanity check. Raises only when +exp+ is exactly false;
# every other value (including nil) passes and nil is returned.
def _assert(exp)
  return unless exp == false

  raise "must not happen"
end
# Yields each integer i with from <= i <= to, in ascending order.
# Nothing is yielded when from > to.
def _each_up(from, to)
  from.upto(to) do |i|
    yield(i)
  end
end
# Yields each integer i with from >= i >= to, in descending order.
# Nothing is yielded when from < to.
def _each_down(from, to)
  from.downto(to) do |i|
    yield(i)
  end
end
# Work array for the induced sort, split into one contiguous bucket per
# distinct character. Buckets are laid out in ascending character order and
# each bucket is as wide as the number of occurrences of its character.
# Empty slots hold nil.
class Buckets
  # s_cs: the text as an array of characters.
  def initialize(s_cs)
    @ary = []
    @freq_map = make_freq_map(s_cs)
    @sorted_uniq_chars = @freq_map.keys.sort
    clear(s_cs.length)
  end

  # Marks slots 0...length as empty.
  def clear(length)
    length.times { |i| @ary[i] = nil }
  end

  # Returns a hash mapping each character of s_cs to its occurrence count.
  def make_freq_map(s_cs)
    s_cs.each_with_object({}) do |c, freq|
      freq[c] = freq.fetch(c, 0) + 1
    end
  end

  # Index of the first slot of the bucket for c. For a character that does
  # not occur in the text this is the total number of slots.
  def first_pos(c)
    pos = 0
    @sorted_uniq_chars.each do |iter_c|
      break if iter_c == c

      pos += num_bucket_elements(iter_c)
    end
    pos
  end

  # Width of the bucket for c (nil for a character that does not occur).
  def num_bucket_elements(c)
    @freq_map[c]
  end

  # Stores si in the first free slot of c's bucket, searching from the head.
  # Raises RuntimeError ("must not happen") when the bucket is full.
  def add_l(c, si)
    head = first_pos(c)
    tail = head + num_bucket_elements(c) - 1
    put_in_first_free(head.upto(tail), si)
  end

  # Stores si in the first free slot of c's bucket, searching from the tail.
  # Raises RuntimeError ("must not happen") when the bucket is full.
  def add_s(c, si)
    head = first_pos(c)
    tail = head + num_bucket_elements(c) - 1
    put_in_first_free(tail.downto(head), si)
  end

  # Total number of slots.
  def num_chars
    @ary.length
  end

  def get(i)
    @ary[i]
  end

  def set(i, value)
    @ary[i] = value
  end

  # Returns the underlying array itself (not a copy).
  def to_array
    @ary
  end

  private

  # Linear scan (slow, but simple): writes si into the first nil slot among
  # the candidate indices and returns si.
  def put_in_first_free(candidates, si)
    slot = candidates.find { |bi| @ary[bi].nil? }
    raise "must not happen" if slot.nil?

    @ary[slot] = si
  end
end
# Classifies every position of s_cs as TYPE_S or TYPE_L by scanning from
# right to left. The last position is always TYPE_S; a position is TYPE_S
# when its character is smaller than the following one, TYPE_L when larger,
# and inherits the following position's type on a tie.
# The character right of position length-2 is taken to be SENTINEL.
def make_types(s_cs)
  last = s_cs.length - 1
  types = []
  types[last] = TYPE_S # the sentinel is S

  next_type = TYPE_S
  next_c = SENTINEL
  _each_down(last - 1, 0) do |si|
    c = s_cs[si]
    next_type =
      if c < next_c
        TYPE_S
      elsif c > next_c
        TYPE_L
      else
        next_type
      end
    types[si] = next_type
    next_c = c
  end

  types
end
# Returns, in ascending order, every position whose left neighbour is
# TYPE_L and which is itself TYPE_S (an LMS position).
# The last position is always treated as TYPE_S (sentinel), whatever the
# array actually holds there.
def extract_lms_positions(types)
  last = types.length - 1
  positions = []

  1.upto(last) do |si|
    own_type = si == last ? TYPE_S : types[si]
    positions << si if types[si - 1] == TYPE_L && own_type == TYPE_S
  end

  positions
end
# Inserts each position of lms_sis, in the given order, into the bucket of
# its character via Buckets#add_s, then returns bkts.
def add_lms_to_bkts(bkts, s_cs, lms_sis)
  lms_sis.each_with_object(bkts) do |lms_si, target|
    target.add_s(s_cs[lms_si], lms_si)
  end
end
# Scans the bucket array from the top (index 0 upward). For every stored
# suffix index si > 0 whose predecessor si-1 is TYPE_L, the predecessor is
# inserted at the head side of its own bucket. Returns bkts.
def induce_l(bkts, s_cs, types)
  _each_up(0, bkts.num_chars - 1) do |bi|
    si = bkts.get(bi)
    next if si.nil? || si == 0

    pred = si - 1
    bkts.add_l(s_cs[pred], pred) if types[pred] == TYPE_L
  end

  bkts
end
# Scans the bucket array from the bottom (last index downward). For every
# stored suffix index si > 0 whose predecessor si-1 is TYPE_S, the
# predecessor is inserted at the tail side of its own bucket. Returns bkts.
def induce_s(bkts, s_cs, types)
  _each_down(bkts.num_chars - 1, 0) do |bi|
    si = bkts.get(bi)
    next if si.nil? || si == 0

    pred = si - 1
    bkts.add_s(s_cs[pred], pred) if types[pred] == TYPE_S
  end

  bkts
end
# True when position si is one of the LMS positions listed in lms_sis.
def is_lms(lms_sis, si)
  lms_sis.member?(si)
end
# One full induced-sort pass over buckets that already contain the LMS
# positions: induce the L-type suffixes, clear every LMS entry stored at
# index 1 or later (index 0 is left alone), then induce the S-type
# suffixes. Returns the buckets.
def induced_sort(bkts, s_cs, types, lms_sis)
  bkts = induce_l(bkts, s_cs, types)

  # Drop the LMS entries except the one at index 0 (the sentinel's slot).
  last = bkts.num_chars - 1
  _each_up(1, last) do |bi|
    bkts.set(bi, nil) if is_lms(lms_sis, bkts.get(bi))
  end

  induce_s(bkts, s_cs, types)
end
# Compares the LMS substrings starting at positions ai and bi character by
# character. Returns false at the first differing character. From offset 2
# on, it also checks LMS membership of both current positions: true when
# both are LMS (both substrings end here), false when only one is.
# (The original paper additionally compares the L/S types.)
def is_same_substring(s_cs, ai, bi, lms_sis)
  offset = 0

  while true
    return false if s_cs[ai + offset] != s_cs[bi + offset]

    if offset >= 2
      a_ends = is_lms(lms_sis, ai + offset)
      b_ends = is_lms(lms_sis, bi + offset)
      return true if a_ends && b_ends
      return false if a_ends || b_ends
      # neither position is LMS: keep comparing
    end

    offset += 1
    _assert(offset < s_cs.length)
  end
end
# Looks up the i-th name in NAME_CHARS.
# Raises RuntimeError when no name exists at that index.
def get_name(i)
  NAME_CHARS[i].tap do |name|
    raise "names (LMS-substring) is too many" if name.nil?
  end
end
# Assigns a name to every entry of sorted_lms_sis_temp (LMS positions in
# LMS-substring order). Neighbouring entries with identical LMS substrings
# share a name; otherwise the name index is advanced.
#
# Returns [names, is_unique]:
#   names     - built with unshift, so it is in REVERSE order of
#               sorted_lms_sis_temp (names.last belongs to the first entry)
#   is_unique - false as soon as two neighbours share an LMS substring
#
# The first entry gets name index 0; the name index is already 1 when the
# first pair is examined, so index 1 is only used if that pair is identical.
def to_names(s_cs, lms_sis, sorted_lms_sis_temp)
  is_unique = true

  # first entry
  names = [get_name(0)]
  ni = 1

  # remaining entries, compared pairwise with their predecessor
  sorted_lms_sis_temp.each_cons(2) do |sia, sib|
    if is_same_substring(s_cs, sia, sib, lms_sis)
      is_unique = false
    else
      ni += 1
    end
    names.unshift(get_name(ni))
  end

  [names, is_unique]
end
# Builds the suffix array of s_cs with a naive (unoptimised) SA-IS.
#
# s_cs: the text as an array of one-character strings whose last element
#       is SENTINEL (see the tests for examples).
# Returns an array in which element k is the start index of the k-th
# smallest suffix.
# Raises RuntimeError via get_name when more names are needed than
# NAME_CHARS provides.
def sa_is(s_cs)
  # Nothing to sort for an empty text or a lone sentinel.
  return (0...s_cs.length).to_a if s_cs.length <= 1

  bkts = Buckets.new(s_cs)
  types = make_types(s_cs)
  lms_sis = extract_lms_positions(types)

  # --------------------------------
  # Induced sort, pass 1: its only purpose is to sort the LMS substrings.
  bkts = add_lms_to_bkts(bkts, s_cs, lms_sis)
  bkts = induced_sort(bkts, s_cs, types, lms_sis)
  # The LMS substrings are now in sorted order, but the relative order of
  # LMS suffixes whose LMS substrings are identical is still undecided.

  # --------------------------------
  # Pull the LMS positions out in the order they appear in the buckets.
  sorted_lms_sis_temp = (0...bkts.num_chars)
                        .map { |bi| bkts.get(bi) }
                        .select { |si| is_lms(lms_sis, si) }

  names, is_unique = to_names(s_cs, lms_sis, sorted_lms_sis_temp)

  if is_unique
    # All LMS substrings differ, so their order is already the order of the
    # LMS suffixes.
    sorted_asc = sorted_lms_sis_temp
  else
    # to_names returns the names in reverse order of sorted_lms_sis_temp.
    # The reduced string must list them in TEXT order of the LMS positions,
    # otherwise the recursion sorts the wrong string.
    name_of = {}
    sorted_lms_sis_temp.each_with_index do |si, rank|
      name_of[si] = names[-1 - rank]
    end
    reduced = lms_sis.map { |si| name_of[si] }
    sorted_asc = sa_is(reduced).map { |i| lms_sis[i] }
  end

  # --------------------------------
  # Induced sort, pass 2. Pass 1 only served to sort the LMS substrings,
  # so the buckets can be emptied first.
  # add_s fills each bucket from its tail, so the LMS suffixes have to be
  # inserted largest first to end up in ascending order inside a bucket.
  bkts.clear(s_cs.length)
  bkts = add_lms_to_bkts(bkts, s_cs, sorted_asc.reverse)
  bkts = induced_sort(bkts, s_cs, types, lms_sis)

  bkts.to_array
end
# Minitest cases for the LMS helpers and for sa_is itself.
class SaIsTest < Minitest::Test
  # --- extract_lms_positions ---

  def test_make_lms_sis_1
    types = %w[L S]
    assert_equal([1], extract_lms_positions(types))
  end

  def test_make_lms_sis_2
    types = %w[S L S]
    assert_equal([2], extract_lms_positions(types))
  end

  def test_make_lms_sis_3
    types = %w[S L S S]
    assert_equal([2], extract_lms_positions(types))
  end

  def test_make_lms_sis_4
    types = %w[S L S L L S]
    assert_equal([2, 5], extract_lms_positions(types))
  end

  # --- is_same_substring ---

  # types: L S L S L S L S -> LMS positions 1, 3, 5, 7
  def test_is_same_substring_1
    t_cs = "bababab$".chars
    assert_equal(true, is_same_substring(t_cs, 1, 3, [1, 3, 5, 7]))
  end

  # types: L S L S L S L S -> LMS positions 1, 3, 5, 7
  def test_is_same_substring_2
    s_cs = "bacabab$".chars
    assert_equal(false, is_same_substring(s_cs, 1, 3, [1, 3, 5, 7]))
  end

  # --- sa_is ---

  # types: L L S S L L S S L L S S L L L L S
  def test_sa_is_bbaaddaaddaaccaa
    s_cs = "bbaaddaaddaaccaa".chars + [SENTINEL]
    expected = [16, 15, 14, 10, 6, 2, 11, 7, 3, 1, 0, 13, 12, 9, 5, 8, 4]
    assert_equal(expected, sa_is(s_cs))
  end

  # types: L S S L S S L S S L S S L S
  def test_sa_is_eaefaegaefaag
    s_cs = "eaefaegaefaag".chars + [SENTINEL]
    expected = [13, 10, 7, 1, 4, 11, 0, 8, 2, 5, 9, 3, 12, 6]
    assert_equal(expected, sa_is(s_cs))
  end

  # types: L S L L S L L S L L L S
  def test_sa_is_mississippi
    s_cs = "mississippi".chars + [SENTINEL]
    expected = [11, 10, 7, 4, 1, 0, 9, 8, 6, 3, 5, 2]
    assert_equal(expected, sa_is(s_cs))
  end

  # types: S S L S L S L S S L L S
  def test_sa_is_abracadabra
    s_cs = "abracadabra".chars + [SENTINEL]
    expected = [11, 10, 7, 0, 3, 5, 8, 1, 4, 6, 9, 2]
    assert_equal(expected, sa_is(s_cs))
  end
end
</code></pre>
<p>(追記 2021-02-14) Ruby 3.0.0 向けに minitest まわりを微修正しました。</p>
<h1 id="参考"><a href="#%E5%8F%82%E8%80%83">参考</a></h1>
<ul>
<li>2018-01-30 <a target="_blank" rel="nofollow noopener" href="https://mametter.hatenablog.com/entry/20180130/p1">SA-IS 法のメモ - まめめも</a>
<ul>
<li>Ruby による実装</li>
</ul></li>
</ul>
sonota486
tag:crieit.net,2005:PublicArticle/16678
2021-02-08T18:51:49+09:00
2021-02-08T18:55:50+09:00
https://crieit.net/posts/Ruby-FM-index
Rubyで素朴なFM-indexを書いてみた
<p>BWT、検索処理の最適化・高速化は行なっていません(SA-IS、ウェーブレット行列などは使っていません)。 BWT から検索まで全体の流れが見渡せる最小限の実装にしました。 せっかくなので Ruby に馴染みのない方が見ても読みやすいと思われる書き方にしています(returnやメソッド呼び出しの括弧を省略しない、など)。</p>
<p>(※ <a target="_blank" rel="nofollow noopener" href="https://memo88.hatenablog.com/entry/20160303/1457007566">2016-03-03 に書いた記事</a>のクロスポストです)</p>
<h1 id="参考にしたもの"><a href="#%E5%8F%82%E8%80%83%E3%81%AB%E3%81%97%E3%81%9F%E3%82%82%E3%81%AE">参考にしたもの</a></h1>
<ul>
<li><a target="_blank" rel="nofollow noopener" href="http://www.langmead-lab.org/teaching-materials/">Teaching Materials</a> - ここに置いてある「Burrows-Wheeler Transform and FM Index」というタイトルの PDF(ジョンズ・ホプキンス大学 Ben Langmead さんの講義資料)</li>
</ul>
<p>最初の取っ掛かりとして分りやすかったのがこれ。要点を押さえた簡潔な図と Python コードを眺めてるだけでだいぶ分かった気になれます。</p>
<hr />
<ul>
<li><a target="_blank" rel="nofollow noopener" href="https://github.com/erukiti/cerebrums">erukiti/cerebrums: 文章・情報共有ソフト</a></li>
</ul>
<p>もっと具体的なところについては実装を見た方が早いということで <a target="_blank" rel="nofollow noopener" href="http://qiita.com/erukiti/items/f11f448d3f4d73fbc1f9">ハクビシンにもわかる全文検索</a> の erukiti さんの実装を参考にさせてもらいました。CoffeeScript 製。</p>
<h1 id="コード"><a href="#%E3%82%B3%E3%83%BC%E3%83%89">コード</a></h1>
<pre><code>- range
- (a..b) => [a, b] b is included
- (a...b) => [a, b) b is not included
- a.downto(b) => [a, a-1, ... b+1, b]
- method { |x| ... } => in JavaScript: method((x)=>{ ... })
</code></pre>
<pre><code class="ruby">require "minitest/autorun"
# End-of-text marker appended to the text before the transform.
SENTINEL = "$"

# Burrows-Wheeler matrix: every rotation of t + SENTINEL, sorted.
# Each rotation is read as a window of length t.length + 1 over
# t + SENTINEL + t.
def to_bwm(t)
  width = t.length + 1
  doubled = t + SENTINEL + t
  Array.new(width) { |i| doubled[i, width] }.sort
end
# Burrows-Wheeler transform: the last column of the sorted rotation matrix,
# joined into one string.
def bwt(t)
  last_column = to_bwm(t).map { |row| row[-1] }
  last_column.join("")
end
# Number of elements of cs that compare smaller than c.
def rank_less_than(cs, c)
  cs.reduce(0) { |total, ch| ch < c ? total + 1 : total }
end
# Number of occurrences of c among cs[0], ..., cs[i - 1].
def rank(cs, c, i)
  0.upto(i - 1).count { |j| cs[j] == c }
end
# LF mapping: the count of elements in cs smaller than c, plus the count of
# c among the first i elements of cs.
def map_lf(cs, c, i)
  smaller = rank_less_than(cs, c)
  earlier = rank(cs, c, i)
  smaller + earlier
end
# Returns the text that precedes the character c located at row s of the
# first column, recovered by following the LF mapping backwards until
# SENTINEL is reached.
#
# bwt_t: Burrows-Wheeler transformed string
# c:     the character to start from, T(i)
# s:     start row for c
# e:     end row; accepted for interface compatibility only. The previous
#        version LF-mapped it on every step but never read the result.
#
# Raises ArgumentError when SENTINEL is not reached within bwt_t.length
# steps (the previous version looped forever on such input).
def backward_chars(bwt_t, c, s, e)
  bwt_cs = bwt_t.split("") # to array of chars
  cb = c # T(i)
  collected = []

  # Every step consumes one character of the text, so a well-formed input
  # needs at most bwt_cs.length steps.
  bwt_cs.length.times do
    s = map_lf(bwt_cs, cb, s)
    ca = bwt_cs[s] # T(i-1)
    return collected.reverse.join if ca == SENTINEL
    raise ArgumentError, "row #{s} is outside the BWT string" if ca.nil?

    collected << ca
    cb = ca
  end

  raise ArgumentError, "sentinel not reached in BWT string"
end
# Inverse BWT: recovers the original text from bwt_t.
def reverse(bwt_t)
  # In the first column the sentinel is always, and only, at row 0,
  # so the walk starts from the row range [0, 1).
  first_row = 0
  next_row = 1
  backward_chars(bwt_t, SENTINEL, first_row, next_row)
end
# Backward search for q in the text behind bwt_t.
# Processes q from its last character to its first, narrowing a row range
# with the LF mapping. Returns [s, e] (e exclusive) for the matching rows,
# or nil as soon as the range becomes empty.
def search_internal(bwt_t, q)
  bwt_cs = bwt_t.split("") # to array of chars
  lo = 0
  hi = bwt_t.length

  idx = q.length - 1
  while idx >= 0
    ch = q[idx]
    lo = map_lf(bwt_cs, ch, lo)
    hi = map_lf(bwt_cs, ch, hi)
    return nil if lo >= hi

    idx -= 1
  end

  [lo, hi]
end
# Number of occurrences of q in the text behind bwt_t (0 when absent).
def search(bwt_t, q)
  lo, hi = search_internal(bwt_t, q)
  return 0 if lo.nil?

  hi - lo
end
# Minitest cases for the BWT, the backward search and the inverse transform.
class FmIndexTest < Minitest::Test
  def setup
    t = "abaaba"
    @bwt_t = bwt(t)
  end

  def test_bwt
    assert_equal("abba$aa", @bwt_t)
  end

  # Searching for a single character
  def test_one_char
    s, e = search_internal(@bwt_t, "a")
    assert_equal(1, s)
    assert_equal(5, e)

    # number of occurrences
    num_hits = search(@bwt_t, "a")
    assert_equal(4, num_hits)
  end

  # Query that occurs twice
  def test_2_hits
    s, e = search_internal(@bwt_t, "aba")
    assert_equal(3, s)
    assert_equal(5, e)
    assert_equal(2, search(@bwt_t, "aba"))
  end

  # Query that occurs once
  def test_one_hit
    s, e = search_internal(@bwt_t, "aaba")
    assert_equal(2, s)
    assert_equal(3, e)
    assert_equal(1, search(@bwt_t, "aaba"))
  end

  # Combination of characters that does not occur.
  # Renamed with the test_ prefix: Minitest only runs methods whose names
  # start with "test_", so this case was silently skipped before.
  def test_non_existent_combination
    s, e = search_internal(@bwt_t, "baba")
    assert_nil(s)
    assert_nil(e)
    assert_equal(0, search(@bwt_t, "baba"))
  end

  # Character that does not occur at all.
  # Renamed with the test_ prefix for the same reason as above.
  def test_non_existent_char
    s, e = search_internal(@bwt_t, "x")
    assert_nil(s)
    assert_nil(e)
    assert_equal(0, search(@bwt_t, "x"))
  end

  def test_reverse
    bwt_t = bwt("mississippi")
    assert_equal("ipssm$pissii", bwt_t)
    assert_equal("mississippi", reverse(bwt_t))
  end
end
</code></pre>
<ul>
<li>(追記 2021-02-08) Ruby 3.0.0 向けに minitest まわりを微修正しました。</li>
</ul>
sonota486
tag:crieit.net,2005:PublicArticle/15090
2019-06-11T11:13:41+09:00
2019-06-11T11:13:41+09:00
https://crieit.net/posts/Python-List
Pythonで学ぶ データ構造入門 List編
<h2 id="TL;DR"><a href="#TL%3BDR">TL;DR</a></h2>
<p>データ構造の基本であるList(LinkedList)やHashMap、Queue、Dequeを自分で実装して理解を深めようという趣旨でやっていきます。</p>
<h2 id="まずはLinkedListを作ってみる"><a href="#%E3%81%BE%E3%81%9A%E3%81%AFLinkedList%E3%82%92%E4%BD%9C%E3%81%A3%E3%81%A6%E3%81%BF%E3%82%8B">まずはLinkedListを作ってみる</a></h2>
<p>LinkedListとは、リストの各々の要素に次の要素への参照をつけておくことで一連のデータを表現できるデータ構造です。</p>
<p><img src="https://qiita-image-store.s3.ap-northeast-1.amazonaws.com/0/209421/e496a785-37db-f4cf-444d-cdec5ba94fe2.png" alt="data structure.png" /></p>
<p>このように、それぞれの要素が次への参照とデータを持っています。<br />
これをPythonで実装してみます。<br />
まずはそれぞれの要素のクラスです。</p>
<p>要素</p>
<pre><code class="python">class Element:
    """
    Element(data, next)

    LinkedListのそれぞれの要素のクラス。
    dataはこの要素が表すデータ。
    nextはこの次の要素の参照。
    """
    def __init__(self, data, next=None):
        self.data = data
        self.next = next
</code></pre>
<p>次にリスト本体を実装してみます。</p>
<p>LinkedList</p>
<pre><code class="python">
class LinkedList:
    """Minimal singly linked list built from ``Element`` nodes.

    Only the head node (``first``) is stored, so every operation that has to
    reach a position walks the chain from the front.
    """

    def __init__(self):
        self.first = None

    @property
    def is_empty(self):
        """True when the list holds no elements."""
        return self.first is None

    def append(self, data):
        """Add ``data`` as the new last element."""
        if self.is_empty:
            self.first = Element(data)
            return
        node = self.first
        while node.next is not None:
            node = node.next
        node.next = Element(data)

    def pop(self):
        """Remove the last element and return its data.

        Raises ValueError when the list is empty.
        """
        if self.is_empty:
            raise ValueError
        # Walk to the last node while remembering the one before it, so the
        # reference to the last node can be cut.
        prev = None
        node = self.first
        while node.next is not None:
            prev, node = node, node.next
        if prev is None:
            # The list had a single element.
            self.first = None
        else:
            prev.next = None
        return node.data

    def remove(self, idx):
        """Remove the element at position ``idx`` and return its data.

        Raises IndexError when ``idx`` is out of range, which includes any
        index on an empty list.
        """
        if idx < 0 or idx >= len(self):
            raise IndexError(idx)
        # Removing the head means moving self.first.
        if idx == 0:
            removed = self.first
            self.first = removed.next
            return removed.data
        prev = self._node_at(idx - 1)
        removed = prev.next
        prev.next = removed.next
        return removed.data

    def insert(self, idx, data):
        """Insert ``data`` so that it ends up at position ``idx``.

        ``idx == len(self)`` appends. Raises IndexError when ``idx`` is
        negative or larger than the current length.
        """
        if idx < 0 or idx > len(self):
            raise IndexError(idx)
        # Inserting at the head means moving self.first.
        if idx == 0:
            self.first = Element(data, self.first)
            return
        # Link the new node right after the node currently at idx - 1.
        prev = self._node_at(idx - 1)
        prev.next = Element(data, prev.next)

    def __iter__(self):
        node = self.first
        while node is not None:
            yield node.data
            node = node.next

    def __len__(self):
        count = 0
        node = self.first
        while node is not None:
            count += 1
            node = node.next
        return count

    def __getitem__(self, idx):
        return self._node_at(idx).data

    def __setitem__(self, idx, val):
        self._node_at(idx).data = val

    def _node_at(self, idx):
        """Return the node at position ``idx``; IndexError when out of range."""
        if idx < 0:
            raise IndexError(idx)
        node = self.first
        for _ in range(idx):
            if node is None:
                break
            node = node.next
        if node is None:
            raise IndexError(idx)
        return node
</code></pre>
<p>こんな感じですね。<br />
これが一番単純なLinkedListの実装です。<br />
ですけど、結構非効率的ですね。</p>
<p>例えば、一番後ろのデータを取得するために最初から全部辿っています。<br />
データにアクセスするためにすべて前からアクセスするのでは、後半のデータにアクセスする際に非効率になってしまいます。<br />
これを解決するのが双方向リストというものです。</p>
<p>(次回があれば)これを双方向リストに改造してみたいと思います。<br />
今日のところはここまでにしましょう。それがいい。</p>
frodo821