tag:crieit.net,2005:https://crieit.net/tags/%E3%83%87%E3%83%BC%E3%82%BF%E6%A7%8B%E9%80%A0/feed 「データ構造」の記事 - Crieit Crieitでタグ「データ構造」に投稿された最近の記事 2021-02-14T18:50:36+09:00 https://crieit.net/tags/%E3%83%87%E3%83%BC%E3%82%BF%E6%A7%8B%E9%80%A0/feed tag:crieit.net,2005:PublicArticle/16685 2021-02-14T18:24:50+09:00 2021-02-14T18:50:36+09:00 https://crieit.net/posts/Ruby-SA-IS-suffix-array-induced-sorting Rubyで素朴なSA-IS(suffix array induced sorting)を書いてみた <p><a target="_blank" rel="nofollow noopener" href="https://memo88.hatenablog.com/entry/20160303/1457007566">FM-index</a> と同様に、ほぼ最適化をしていない素朴な実装を作ってみました。</p> <p>元の論文の実装は空間効率まで考慮して書かれていてすごい! ……のですが、まずはそれ以外の部分の仕組みを理解したかったので、以下のコードではそこらへんも無視しています。</p> <p>(※ <a target="_blank" rel="nofollow noopener" href="https://memo88.hatenablog.com/entry/20160317/1458224137">2016-03-17 に書いた記事</a>のクロスポストです)</p> <h1 id="コード"><a href="#%E3%82%B3%E3%83%BC%E3%83%89">コード</a></h1> <pre><code class="ruby">require "minitest/autorun" SENTINEL = "$" TYPE_L = "L" TYPE_S = "S" NAME_CHARS = [ "a","b","c","d","e","f","g","h","i","j", "k","l","m","n","o","p","q","r","s","t", "u","v","w","x","y","z" ] def _assert(exp) raise "must not happen" if exp == false end # from &lt;= i &lt;= to def _each_up(from, to) ( from.upto(to) ).each {|i| yield(i) } end # from &gt;= i &gt;= to def _each_down(from, to) ( from.downto(to) ).each {|i| yield(i) } end class Buckets def initialize(s_cs) @ary = [] @freq_map = make_freq_map(s_cs) @sorted_uniq_chars = @freq_map.keys().sort() clear(s_cs.length()) end def clear(length) _each_up(0, length - 1) {|i| @ary[i] = nil } end def make_freq_map(s_cs) uniq_cs = s_cs.uniq() # 初期化 freq_map = {} uniq_cs.each {|c| freq_map[c] = 0 } s_cs.each {|c| freq_map[c] += 1 } return freq_map end def first_pos(c) i = 0 found = false @sorted_uniq_chars.each {|iter_c| if (iter_c == c) found = true break end i += num_bucket_elements(iter_c) } return i end def num_bucket_elements(c) return @freq_map[c] end def add_l(c, si) # バケツの先頭・末尾 bia = first_pos(c) 
biz = bia + num_bucket_elements(c) - 1 i = nil # 線形走査。遅い。 _each_up(bia, biz) {|bi| if @ary[bi] != nil next end i = bi break } _assert( i != nil ) @ary[i] = si end def add_s(c, si) # バケツの先頭・末尾 bia = first_pos(c) biz = bia + num_bucket_elements(c) - 1 i = nil # 線形走査。遅い。 _each_down(biz, bia) {|bi| if @ary[bi] != nil next end i = bi break } _assert( i != nil ) @ary[i] = si end def num_chars() return @ary.length() end def get(i) return @ary[i] end def set(i, value) @ary[i] = value end def to_array() return @ary end end def make_types(s_cs) types = [] types[s_cs.length() - 1] = TYPE_S # sentinel は S prev_type = TYPE_S prev_c = SENTINEL _each_down(s_cs.length() - 2, 0) {|si| c = s_cs[si] if (c < prev_c) type = TYPE_S elsif (c > prev_c) type = TYPE_L else type = prev_type end types[si] = type prev_c = c prev_type = type } return types end def extract_lms_positions(types) sis = [] prev_type = TYPE_S # sentinel _each_down(types.length() - 2, 0) {|si| type = types[si] if (type == TYPE_L && prev_type == TYPE_S) sis.unshift(si + 1) end prev_type = type } return sis end def add_lms_to_bkts(bkts, s_cs, lms_sis) lms_sis.each {|lms_si| c = s_cs[lms_si] bkts.add_s(c, lms_si) } return bkts end def induce_l(bkts, s_cs, types) # 上から _each_up(0, bkts.num_chars() - 1) {|bi| si = bkts.get(bi) next if si == nil next if si == 0 next if types[si - 1] != TYPE_L prev_c = s_cs[si - 1] bkts.add_l(prev_c, si - 1) } return bkts end def induce_s(bkts, s_cs, types) # 下から _each_down(bkts.num_chars() - 1, 0) {|bi| si = bkts.get(bi) next if si == nil next if si == 0 next if types[si - 1] != TYPE_S prev_c = s_cs[si - 1] bkts.add_s(prev_c, si - 1) } return bkts end def is_lms(lms_sis, si) return lms_sis.include?(si) end def induced_sort(bkts, s_cs, types, lms_sis) bkts = induce_l(bkts, s_cs, types) # sentinel 以外の LMS を除去 _each_up(1, bkts.num_chars() - 1) {|bi| si = bkts.get(bi) bkts.set(bi, nil) if is_lms(lms_sis, si) } bkts = induce_s(bkts, s_cs, types) return bkts end def is_same_substring(s_cs, ai, 
bi, lms_sis) i = 0 is_same = true while true if s_cs[ai + i] != s_cs[bi + i] is_same = false break end if i >= 2 # 元論文ではタイプ(L/S)による判別も行なっている a_is_lms = is_lms(lms_sis, ai + i) b_is_lms = is_lms(lms_sis, bi + i) if (a_is_lms && b_is_lms) break elsif (! a_is_lms && b_is_lms) is_same = false break elsif (a_is_lms && ! b_is_lms) is_same = false break else # both are not LMS end end i += 1 _assert(i < s_cs.length()) end return is_same end def get_name(i) name = NAME_CHARS[i] if (name == nil) raise "names (LMS-substring) is too many" end return name end def to_names(s_cs, lms_sis, sorted_lms_sis_temp) is_unique = true # name index ni = 0 names = [] # 1個目 names.unshift(get_name(ni)) ni += 1 # 2個目以降 _each_up(0, sorted_lms_sis_temp.length() - 2) {|ai| sia = sorted_lms_sis_temp[ai] sib = sorted_lms_sis_temp[ai + 1] if (is_same_substring(s_cs, sia, sib, lms_sis)) is_unique = false else ni += 1 end names.unshift(get_name(ni)) } return [names, is_unique] end def sa_is(s_cs) bkts = Buckets.new(s_cs) types = make_types(s_cs) lms_sis = extract_lms_positions(types) # -------------------------------- # induced sort 1回目 # LMS-substring をソートするのが目的 bkts = add_lms_to_bkts(bkts, s_cs, lms_sis) bkts = induced_sort(bkts, s_cs, types, lms_sis) # この時点で LMS-substring がソートされた状態になる # (ただし、重複した LMS-substring 同士の順序は未確定) # -------------------------------- # LMS-substring のソート # LMS だけを抜き出す sorted_lms_sis_temp = [] _each_up(0, bkts.num_chars() - 1) {|bi| si = bkts.get(bi) if (is_lms(lms_sis, si)) sorted_lms_sis_temp << si end } names, is_unique = to_names(s_cs, lms_sis, sorted_lms_sis_temp) sorted_lms_sis = nil if (is_unique) sorted_lms_sis = sorted_lms_sis_temp else ret = sa_is(names) sorted_lms_sis = [] ret.each {|i| sorted_lms_sis.unshift(lms_sis[i]) } end # -------------------------------- # induced sort 2回目 # 1回目のソートは LMS-substring のソート結果を得るのが目的だったので # 一旦空にして良い。 bkts.clear(s_cs.length()) bkts = add_lms_to_bkts(bkts, s_cs, sorted_lms_sis) bkts = induced_sort(bkts, s_cs, types, sorted_lms_sis) 
return bkts.to_array() end class SaIsTest < Minitest::Test def test_make_lms_sis_1 # 0 1 assert_equal([1], extract_lms_positions(["L","S"])) end def test_make_lms_sis_2 # 0 1 2 assert_equal([2], extract_lms_positions(["S","L","S"])) end def test_make_lms_sis_3 # 0 1 2 3 assert_equal [2], extract_lms_positions(["S","L","S","S"]) end def test_make_lms_sis_4 # 0 1 2 3 4 5 assert_equal([2, 5], extract_lms_positions(["S","L","S","L","L","S"])) end def test_is_same_substring_1 # 0 1 2 3 4 5 6 7 # L S L S L S L S t_cs = ["b","a","b","a","b","a","b","$"] lms_sis = [1,3,5,7] assert_equal true, is_same_substring(t_cs, 1, 3, lms_sis) end def test_is_same_substring_2 # 0 1 2 3 4 5 6 7 # L S L S L S L S s_cs = ["b","a","c","a","b","a","b","$"] lms_sis = [1,3,5,7] assert_equal false, is_same_substring(s_cs, 1, 3, lms_sis) end def test_sa_is_bbaaddaaddaaccaa # 0 1 # 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 # L L S S L L S S L L S S L L L L S s_cs = ["b","b","a","a","d","d","a","a","d","d","a","a","c","c","a","a", SENTINEL] assert_equal( [16 ,15 ,14 ,10 ,6 ,2 ,11 ,7 ,3 ,1 ,0 ,13 ,12 ,9 ,5 ,8 ,4], sa_is(s_cs) ) end def test_sa_is_eaefaegaefaag # 0 1 # 0 1 2 3 4 5 6 7 8 9 0 1 2 3 # L S S L S S L S S L S S L S s_cs = ["e","a","e","f","a","e","g","a","e","f","a","a","g", SENTINEL] assert_equal( [13,10,7,1,4,11,0,8,2,5,9,3,12,6], sa_is(s_cs) ) end def test_sa_is_mississippi # 0 1 # 0 1 2 3 4 5 6 7 8 9 0 1 # L S L L S L L S L L L S s_cs = ["m","i","s","s","i","s","s","i","p","p","i", SENTINEL] assert_equal( [11,10,7,4,1,0,9,8,6,3,5,2], sa_is(s_cs) ) end def test_sa_is_abracadabra # 0 1 # 0 1 2 3 4 5 6 7 8 9 0 1 # S S L S L S L S S L L S s_cs = ["a","b","r","a","c","a","d","a","b","r","a", SENTINEL] assert_equal( [11,10,7,0,3,5,8,1,4,6,9,2], sa_is(s_cs) ) end end </code></pre> <p>(追記 2021-02-14) Ruby 3.0.0 向けに minitest まわりを微修正しました。</p> <h1 id="参考"><a href="#%E5%8F%82%E8%80%83">参考</a></h1> <ul> <li>2018-01-30 <a target="_blank" rel="nofollow noopener" 
href="https://mametter.hatenablog.com/entry/20180130/p1">SA-IS 法のメモ - まめめも</a> <ul> <li>Ruby による実装</li> </ul></li> </ul> sonota486 tag:crieit.net,2005:PublicArticle/16678 2021-02-08T18:51:49+09:00 2021-02-08T18:55:50+09:00 https://crieit.net/posts/Ruby-FM-index Rubyで素朴なFM-indexを書いてみた <p>BWT、検索処理の最適化・高速化は行なっていません(SA-IS、ウェーブレット行列などは使っていません)。 BWT から検索まで全体の流れが見渡せる最小限の実装にしました。 せっかくなので Ruby に馴染みのない方が見ても読みやすいと思われる書き方にしています(returnやメソッド呼び出しの括弧を省略しない、など)。</p> <p>(※ <a target="_blank" rel="nofollow noopener" href="https://memo88.hatenablog.com/entry/20160303/1457007566">2016-03-03 に書いた記事</a>のクロスポストです)</p> <h1 id="参考にしたもの"><a href="#%E5%8F%82%E8%80%83%E3%81%AB%E3%81%97%E3%81%9F%E3%82%82%E3%81%AE">参考にしたもの</a></h1> <ul> <li><a target="_blank" rel="nofollow noopener" href="http://www.langmead-lab.org/teaching-materials/">Teaching Materials</a> - ここに置いてある「Burrows-Wheeler Transform and FM Index」というタイトルの PDF(ジョンズ・ホプキンス大学 Ben Langmead さんの講義資料)</li> </ul> <p>最初の取っ掛かりとして分りやすかったのがこれ。要点を押さえた簡潔な図と Python コードを眺めてるだけでだいぶ分かった気になれます。</p> <hr /> <ul> <li><a target="_blank" rel="nofollow noopener" href="https://github.com/erukiti/cerebrums">erukiti/cerebrums: 文章・情報共有ソフト</a></li> </ul> <p>もっと具体的なところについては実装を見た方が早いということで <a target="_blank" rel="nofollow noopener" href="http://qiita.com/erukiti/items/f11f448d3f4d73fbc1f9">ハクビシンにもわかる全文検索</a> の erukiti さんの実装を参考にさせてもらいました。CoffeeScript 製。</p> <h1 id="コード"><a href="#%E3%82%B3%E3%83%BC%E3%83%89">コード</a></h1> <pre><code>- range - (a..b) => [a, b] b is included - (a...b) => [a, b) b is not included - a.downto(b) => [a, a-1, ... b+1, b] - method { |x| ... } => in JavaScript: method((x)=>{ ... 
}) </code></pre> <pre><code class="ruby">require "minitest/autorun" SENTINEL = "$" def to_bwm(t) tt = t + SENTINEL + t rows = (0..t.length).map { |i| tt[i..(i + t.length)] } return rows.sort end # Burrows-Wheeler transform def bwt(t) bwm = to_bwm(t) return bwm.map { |cs| cs[-1] }.join("") end def rank_less_than(cs, c) return cs.count { |_c| _c < c } end def rank(cs, c, i) return (0...i).count { |j| cs[j] == c } end # LF mapping def map_lf(cs, c, i) return rank_less_than(cs, c) + rank(cs, c, i) end # String before c def backward_chars(bwt_t, c, s, e) bwt_cs = bwt_t.split("") # to array of chars cb = c # T(i) ca = nil # T(i-1) result = "" while true s = map_lf(bwt_cs, cb, s) e = map_lf(bwt_cs, cb, e) # assert e - s == 1 ca = bwt_t[s] if (ca == SENTINEL) break end result += ca cb = ca end return result.reverse end def reverse(bwt_t) # sentinel は F列では必ず 0 行目のみに存在する # sentinel is always at F[0] s = 0 # start e = 1 # end return backward_chars(bwt_t, SENTINEL, s, e) end def search_internal(bwt_t, q) bwt_cs = bwt_t.split("") # to array of chars s = 0 e = bwt_t.length (q.length - 1).downto(0).each { |i| s = map_lf(bwt_cs, q[i], s) e = map_lf(bwt_cs, q[i], e) if (s >= e) return nil end } return [s, e] end def search(bwt_t, q) s, e = search_internal(bwt_t, q) return s == nil ? 
0 : (e - s) end class FmIndexTest < Minitest::Test def setup t = "abaaba" @bwt_t = bwt(t) end def test_bwt assert_equal("abba$aa", @bwt_t) end # 1文字の検索 def test_one_char s, e = search_internal(@bwt_t, "a") assert_equal(1, s) assert_equal(5, e) # 出現回数 num_hits = search(@bwt_t, "a") assert_equal(4, num_hits) end # 2回出現する def test_2_hits s, e = search_internal(@bwt_t, "aba") assert_equal(3, s) assert_equal(5, e) assert_equal(2, search(@bwt_t, "aba")) end # 1回出現する def test_one_hit s, e = search_internal(@bwt_t, "aaba") assert_equal(2, s) assert_equal(3, e) assert_equal(1, search(@bwt_t, "aaba")) end # 存在しない組み合わせ def non_existent_combination s, e = search_internal(@bwt_t, "baba") assert_equal(nil, s) assert_equal(nil, e) assert_equal(0, search(@bwt_t, "baba")) end # 存在しない文字 def non_existent_char s, e = search_internal(@bwt_t, "x") assert_equal(nil, s) assert_equal(nil, e) assert_equal(0, search(@bwt_t, "x")) end def test_reverse bwt_t = bwt("mississippi") assert_equal("ipssm$pissii", bwt_t) assert_equal("mississippi", reverse(bwt_t)) end end </code></pre> <ul> <li>(追記 2021-02-08) Ruby 3.0.0 向けに minitest まわりを微修正しました。</li> </ul> sonota486 tag:crieit.net,2005:PublicArticle/15090 2019-06-11T11:13:41+09:00 2019-06-11T11:13:41+09:00 https://crieit.net/posts/Python-List Pythonで学ぶ データ構造入門 List編 <h2 id="TL;DR"><a href="#TL%3BDR">TL;DR</a></h2> <p>データ構造の基本であるList(LinkedList)やHashMap、Queue、Dequeを自分で実装して理解を深めようという趣旨でやっていきます。</p> <h2 id="まずはLinkedListを作ってみる"><a href="#%E3%81%BE%E3%81%9A%E3%81%AFLinkedList%E3%82%92%E4%BD%9C%E3%81%A3%E3%81%A6%E3%81%BF%E3%82%8B">まずはLinkedListを作ってみる</a></h2> <p>LinkedListとは、リストの各々の要素に次の要素への参照をつけておくことで一連のデータを表現できるデータ構造です。</p> <p><img src="https://qiita-image-store.s3.ap-northeast-1.amazonaws.com/0/209421/e496a785-37db-f4cf-444d-cdec5ba94fe2.png" alt="data structure.png" /></p> <p>このように、それぞれの要素が次への参照とデータを持っています。<br /> これをPythonで実装してみます。<br /> まずはそれぞれの要素のクラスです。</p> <p>```python:要素<br /> class Element:<br /> """<br /> Element(data, next)</p> 
class Element:
    """Element(data, next)

    A single node of a LinkedList.

    Args:
        data: The value this node represents.
        next: Reference to the following node, or None for the last node.
    """

    def __init__(self, data, next=None):
        self.data = data
        self.next = next


# Next, the list itself.
class LinkedList:
    """Minimal singly linked list that stores only a reference to its head.

    Every positional operation walks the chain from the head, so indexing,
    append and pop all take time proportional to the list length.
    Negative indices are not supported and raise IndexError.
    """

    def __init__(self):
        # Head node; None means the list is empty.
        self.first = None

    @property
    def is_empty(self):
        """True when the list holds no elements."""
        return self.first is None

    def _node_at(self, idx):
        """Return the node at position idx, or None when idx is out of range."""
        if idx < 0:
            return None
        node = self.first
        for _ in range(idx):
            if node is None:
                return None
            node = node.next
        return node

    def append(self, data):
        """Add data as the new last element."""
        new_node = Element(data)
        if self.first is None:
            self.first = new_node
            return
        node = self.first
        while node.next is not None:
            node = node.next
        node.next = new_node

    def pop(self):
        """Remove and return the last element.

        Raises:
            ValueError: If the list is empty.
        """
        if self.first is None:
            raise ValueError("pop from empty LinkedList")
        # A single-element list has no predecessor node to unlink from.
        if self.first.next is None:
            data = self.first.data
            self.first = None
            return data
        # Stop on the second-to-last node so the last one can be detached.
        node = self.first
        while node.next.next is not None:
            node = node.next
        data = node.next.data
        node.next = None
        return data

    def remove(self, idx):
        """Remove the element at position idx and return its data.

        Raises:
            IndexError: If idx is negative or not a valid position
                (including any idx on an empty list).
        """
        if idx == 0:
            head = self.first
            if head is None:
                raise IndexError(idx)
            self.first = head.next
            return head.data
        prev = self._node_at(idx - 1)
        if prev is None or prev.next is None:
            raise IndexError(idx)
        target = prev.next
        prev.next = target.next
        return target.data

    def insert(self, idx, data):
        """Insert data so that it ends up at position idx.

        idx may equal the current length, which appends at the end.

        Raises:
            IndexError: If idx is negative or greater than the length.
        """
        if idx == 0:
            self.first = Element(data, self.first)
            return
        # The new node is linked in right after the node at idx - 1.
        prev = self._node_at(idx - 1)
        if prev is None:
            raise IndexError(idx)
        prev.next = Element(data, prev.next)

    def __iter__(self):
        """Yield every element's data from first to last."""
        node = self.first
        while node is not None:
            yield node.data
            node = node.next

    def __len__(self):
        """Count the elements by walking the whole chain."""
        count = 0
        node = self.first
        while node is not None:
            count += 1
            node = node.next
        return count

    def __getitem__(self, idx):
        """Return the data at position idx.

        Raises:
            IndexError: If idx is negative or past the last element.
        """
        node = self._node_at(idx)
        if node is None:
            raise IndexError(idx)
        return node.data

    def __setitem__(self, idx, val):
        """Replace the data at position idx with val.

        Raises:
            IndexError: If idx is negative or past the last element.
        """
        node = self._node_at(idx)
        if node is None:
            raise IndexError(idx)
        node.data = val
これが一番単純なLinkedListの実装です。<br /> ですけど、結構非効率的ですね。</p> <p>例えば、一番後ろのデータを取得するために最初から全部辿っています。<br /> どのデータにアクセスするにも必ず先頭から順に辿るのでは、後半のデータにアクセスする際に非効率になってしまいます。<br /> これを解決するのが双方向リストというものです。</p> <p>(次回があれば)これを双方向リストに改造してみたいと思います。<br /> 今日のところはここまでにしましょう。それがいい。</p> frodo821