TSVで定義した辞書をMarkdownで出力するツールを作った話

2019-09-10T20:49:01+09:00

TL;DR

電子辞書が欲しくなったので作ることにしました。今回の要件は、品詞別の索引と全単語の索引、先頭の文字ごとの詳細解説があることです。

そこで辞書本体をmarkdownで書くことにしたのですが、ちまちま手で書くのは面倒くさい。なのでTSVを読み込んでmarkdownを吐くジェネレータを簡単に書いてみることにしました。

TSVに格納する辞書の形式を考えてみた

TSVとは言いつつ、純粋なTSVは使っていません。まずは辞書のヘッダ部分です。

BEGIN_HEADER            
LANGUAGE_LONG   Language Name       
LANGUAGE_CODE   LC(注1)      
PHONETICAL_CHARS    頭文字になりうる文字の列挙(注2)       
END_HEADER

注1 これは2～3文字の言語コードです。ja, enなど
注2 スペース区切りで列挙します。a b c d e f g h i j k l m n o p q r s t u v w x y zのように

続いて、辞書の本体を考えてみました。

BEGIN_DICTIONARY            
単語  品詞ID(注1)    意味  関連語(注2)
END_DICTIONARY

注1 品詞IDは任意の文字列です。
注2 関連語はスペース区切りで列挙します。study learnのように
単語は任意個この形式で列挙します。

最後に、品詞の定義です。

BEGIN_DEFINITION            
品詞ID    品詞の名称       
END_DEFINITION

このフィールドでは、DICTIONARYフィールド内で使用した品詞IDとその名称の対応(NOUNと名詞のような)を定義します。

パーサーをざっくり書いてみる

さて、このパーサーをざっくり書いてみました。

import csv

class ParseError(SyntaxError):
  """Raised when the dictionary TSV breaks the BEGIN_*/END_* section format."""

def open_dict(dic_path: str) -> list:
  """Read the TSV dictionary file at *dic_path* and return its rows.

  Each row is the list of tab-separated fields of one record.  The file is
  decoded as UTF-8 and opened with ``newline=''`` as the ``csv`` module
  requires, so line breaks inside quoted fields are kept verbatim.
  """
  with open(dic_path, encoding='utf-8', newline='') as f:
    return list(csv.reader(f, delimiter='\t'))

def parse_dict(dic: list) -> dict:
  """Convert the rows of a dictionary TSV into nested dicts.

  Rows between ``BEGIN_HEADER``/``END_HEADER``, ``BEGIN_DICTIONARY``/
  ``END_DICTIONARY`` and ``BEGIN_DEFINITION``/``END_DEFINITION`` are stored
  under the keys ``'header'``, ``'dict'`` and ``'defs'``; a key is only
  present when its section appears in the input.  Rows outside any section
  and rows whose fields are all empty are ignored.

  Args:
    dic: Rows as produced by ``csv.reader`` (lists of field strings).

  Returns:
    A dict with up to three entries: ``'header'`` maps field name to value,
    ``'defs'`` maps part-of-speech ID to its display name, and ``'dict'``
    maps word -> part-of-speech ID ->
    ``{'meaning': str, 'reference': list of str}``.

  Raises:
    ParseError: On a misplaced or unclosed section tag, or on a row with too
      few columns for its section.
  """
  # Opening tag -> (parser state, key of the result dict it fills).
  begin_tags = {
    'BEGIN_HEADER': ('header', 'header'),
    'BEGIN_DICTIONARY': ('dictionary', 'dict'),
    'BEGIN_DEFINITION': ('definition', 'defs'),
  }
  # Closing tag -> parser state it is allowed to close.
  end_tags = {
    'END_HEADER': 'header',
    'END_DICTIONARY': 'dictionary',
    'END_DEFINITION': 'definition',
  }

  ret = {}
  state = 'none'
  for row in dic:
    # csv.reader yields [] for blank lines and all-empty fields for lines
    # made only of tabs; neither carries data and row[0] would fail on [].
    if not any(row):
      continue

    tag = row[0]
    if tag in begin_tags:
      if state != 'none':
        raise ParseError(f'Unexpected {tag} tag.')
      state, key = begin_tags[tag]
      ret[key] = {}
      continue

    if tag in end_tags:
      if state != end_tags[tag]:
        raise ParseError(f'Unexpected {tag} tag.')
      state = 'none'
      continue

    if state == 'none':
      continue

    if state == 'dictionary':
      if len(row) < 3:
        raise ParseError(
          f'A dictionary row needs a word, a part-of-speech ID and a meaning: {row!r}')
      word, kind, meaning = row[:3]
      ret['dict'].setdefault(word, {})[kind] = {
        'meaning': meaning,
        # The related-words column is optional.
        'reference': row[3].split(' ') if len(row) > 3 else [],
      }
      continue

    # Remaining states ('header' and 'definition') both hold key/value rows.
    if len(row) < 2:
      raise ParseError(f'A {state} row needs a key and a value: {row!r}')
    ret['header' if state == 'header' else 'defs'][row[0]] = row[1]

  if state != 'none':
    raise ParseError(f'A match pair tag of END_{state.upper()} not found.')

  return ret

まぁ、本当に簡単に書いているので、解説することもほとんどないんですけれど……。ざっくり説明すると、tsvを読み込んで2次元配列に格納し、それを先ほど定義したフォーマットに従って辞書に格納しなおしているだけです。

次に、この生成した辞書から索引情報を抽出する関数を定義してみます。

def get_comparator(_order):
  """Return a ``str`` subclass that orders by a custom alphabet.

  Instances compare character by character using each character's position
  in *_order*; when one string is a prefix of the other, the shorter one
  sorts first.  The class is meant to be used as a sort key, e.g.
  ``words.sort(key=get_comparator('abc'))``.

  Characters that do not occur in *_order* sort after every listed character
  and among themselves by code point, instead of raising ``ValueError``.

  Args:
    _order: Iterable of single characters in ascending dictionary order.

  Returns:
    The comparator class.
  """
  # Position table built once; first occurrence wins, like list.index().
  rank = {}
  for position, char in enumerate(_order):
    rank.setdefault(char, position)

  def sort_key(text):
    # (0, position) for listed characters, (1, code point) for the rest, so
    # list comparison gives character-wise order and then shorter-first.
    return [(0, rank[c]) if c in rank else (1, ord(c)) for c in text]

  class _Comparator(str):
    def __lt__(self, other):
      return sort_key(self) < sort_key(other)

    def __gt__(self, other):
      return sort_key(self) > sort_key(other)

    def __le__(self, other):
      return sort_key(self) <= sort_key(other)

    def __ge__(self, other):
      return sort_key(self) >= sort_key(other)

  return _Comparator

def generate_indices(dic: dict):
  """Group every word by index and by initial character.

  The result maps each part-of-speech ID found in ``dic['defs']``, plus the
  extra ID ``'ALPHABETICAL'`` (all words), to a dict of
  ``initial character -> sorted list of words``.  The initials come from the
  space-separated ``PHONETICAL_CHARS`` header field, defaulting to a-z, and
  the same characters define the sort order through ``get_comparator``.
  """
  default_chars = 'a b c d e f g h i j k l m n o p q r s t u v w x y z'
  initials = dic['header'].get('PHONETICAL_CHARS', default_chars).split(' ')

  def empty_buckets():
    return {initial: [] for initial in initials}

  indices = {}
  for pos_id in dic['defs']:
    indices[pos_id] = empty_buckets()
  indices['ALPHABETICAL'] = empty_buckets()
  ordering = get_comparator(''.join(initials))

  for word, senses in dic['dict'].items():
    head = word[0]
    # Every word goes into the all-words index and into one index per
    # part of speech it is defined for.
    for index_name in ('ALPHABETICAL', *senses):
      indices[index_name][head].append(word)

  for buckets in indices.values():
    for bucket in buckets.values():
      bucket.sort(key=ordering)

  return indices

get_comparator関数は、PHONETICAL_CHARSヘッダフィールドで定義された辞書順に従って文字列を比較できるようにするラッパークラスを返す関数です。generate_indices関数のほうは、品詞ごとに頭文字別の配列を作り、そこに単語のみを格納しているだけですね。ALPHABETICALは全単語索引の情報で、定義されたすべての単語が登録されています。

次に意味情報と関連語情報を抽出する関数を定義します。

def generate_dict_content(dic: dict):
  """Collect the entries of each content page, keyed by initial character.

  The result maps every character of the space-separated ``PHONETICAL_CHARS``
  header field (default a-z) to a list of entries sorted with
  ``get_comparator``.  Each entry is a dict with:

  * ``'surface'``: the word itself,
  * ``'meaning'``: a list of ``(part-of-speech ID, meaning split on spaces)``,
  * ``'reference'``: the related words of all its parts of speech, in order.
  """
  default_chars = 'a b c d e f g h i j k l m n o p q r s t u v w x y z'
  initials = dic['header'].get('PHONETICAL_CHARS', default_chars).split(' ')
  contents = {initial: [] for initial in initials}
  ordering = get_comparator(''.join(initials))

  for word, senses in dic['dict'].items():
    contents[word[0]].append({
      'surface': word,
      'meaning': [
        (kind, sense['meaning'].split(' '))
        for kind, sense in senses.items()
      ],
      'reference': [
        ref for sense in senses.values() for ref in sense['reference']
      ],
    })

  for entries in contents.values():
    entries.sort(key=lambda entry: ordering(entry['surface']))
  return contents

この関数についての解説は、generate_indices関数とほぼ同じ動作のため割愛します。

これでTSVから単語情報を抽出する関数がそろいました。次はこれをmarkdownとして出力する関数を作っていきます。

抽出した情報をMarkdownにしてみる

索引を生成する関数を考えてみました。こんな感じです。

def generate_index_file(kind: str, defs: dict, index: dict):
  """Render the Markdown source of one index page.

  Args:
    kind: Part-of-speech ID of the index, or ``'ALPHABETICAL'`` for the
      all-words index.
    defs: Maps every index ID (including *kind*) to its page title.
    index: Maps an initial character to the list of words listed under it.

  Returns:
    The page as a single string: a title, then (for ``'ALPHABETICAL'`` only)
    links to the other index pages, then one section per initial whose items
    link to ``./content/<INITIAL>.md#<word>``.
  """
  parts = [f"# {defs[kind]}\n"]

  # Only the all-words index links out to the per-part-of-speech indices.
  if kind == 'ALPHABETICAL':
    parts.append("\n## 品詞別インデックス\n")
    parts.extend(
      f"* [{title}](./{pos_id.lower()}.md)\n"
      for pos_id, title in defs.items()
      if pos_id != 'ALPHABETICAL')

  for representative, words in index.items():
    parts.append(f"\n## {representative.upper()}\n")
    parts.extend(
      f"* [{word}](./content/{word[0].upper()}.md#{word})\n"
      for word in words)
  return ''.join(parts)

すごくシンプルに書けて満足しています。あまりPythonicではないと思いますが、そこは気にしないことにします。あと、ALPHABETICALのページに品詞別インデックスへのリンクを表示することにしました。引数の意味ですが、kindは品詞ID、defsは品詞の定義、indexには頭文字をキー、その頭文字で始まる単語のリストを値とする辞書を渡します。

続いて、単語の解説ページを生成する関数を考えてみました。

def generate_content_file(representative: str, words: list, defs: dict):
  """Render the Markdown page that explains all words sharing one initial.

  Args:
    representative: The initial character; its upper-case form is the title.
    words: Entries with the keys ``'surface'``, ``'meaning'`` (a list of
      ``(part-of-speech ID, meaning lines)``) and ``'reference'``.
    defs: Maps part-of-speech ID to its display name.

  Returns:
    The page as one string.  Each word gets a numbered meaning list and,
    when it has any non-empty related word, a list of links to them.
  """
  chunks = [f"# {representative.upper()}\n"]
  for entry in words:
    chunks.append(f"\n## {entry['surface']}\n")
    chunks.append("意味:  \n")
    for number, (kind, lines) in enumerate(entry['meaning'], start=1):
      chunks.append(f"{number}. <{defs[kind]}>  \n")
      chunks.extend(f"  {line}  \n" for line in lines)

    # Empty strings come from blank related-word columns; drop them.
    related = list(filter(None, entry['reference']))
    if not related:
      continue
    chunks.append("\n関連語:  \n")
    chunks.extend(
      f"* [{ref}](./{ref[0].upper()}.md#{ref})\n" for ref in related)
  return ''.join(chunks)

引数の意味ですが、representativeは代表の文字(ようするにそのページの単語に共通の頭文字)、wordsは単語とそのメタ情報、defsは品詞の定義をとります。

さて、最後にこれらの関数の動作を連結する関数を書きましょう。それで完成です。

def generate_markdown_files(dic: dict):
  """Turn a parsed dictionary into the Markdown text of every output page.

  Returns a dict with two entries:

  * ``'content'``: initial character -> text of that character's word page,
    one per character of ``PHONETICAL_CHARS`` (default a-z);
  * ``'indices'``: page name -> text of an index page, where the all-words
    index is named ``'index'`` and the others use the lower-cased
    part-of-speech ID.
  """
  index_data = generate_indices(dic)
  entries_by_initial = generate_dict_content(dic)
  default_chars = 'a b c d e f g h i j k l m n o p q r s t u v w x y z'
  initials = dic['header'].get('PHONETICAL_CHARS', default_chars).split(' ')
  # Page titles: the part-of-speech names plus one for the all-words index.
  titles = {**dic['defs'], 'ALPHABETICAL': "全単語索引"}

  content_pages = {}
  for initial in initials:
    content_pages[initial] = generate_content_file(
      initial, entries_by_initial[initial], titles)

  index_pages = {}
  for kind, buckets in index_data.items():
    page_name = 'index' if kind == 'ALPHABETICAL' else kind.lower()
    index_pages[page_name] = generate_index_file(kind, titles, buckets)

  return {'content': content_pages, 'indices': index_pages}

この関数はparseされたTSVをmarkdown形式の文字列に変換する関数です。ここまでに定義した関数を連結して整形された形にするのが役割ですね。

さて、これをファイルにdumpする関数を書いて、それで本当に完成です。

def dump_markdown(dic_path: str, dump_dir: str):
  """Generate all Markdown pages from a dictionary TSV and write them out.

  Index pages are written to ``<dump_dir>/<name>.md`` and word pages to
  ``<dump_dir>/content/<INITIAL>.md``, all encoded as UTF-8.  Missing
  directories are created.  Progress is printed to standard output.

  Args:
    dic_path: Path of the dictionary TSV file.
    dump_dir: Directory that receives the generated files.
  """
  def write_file(path, text):
    # exist_ok=True replaces the racy exists()-then-makedirs() check.
    md(dirname(path), exist_ok=True)
    print(f'writing file: {path}')
    with open(path, 'w', encoding='utf-8') as f:
      f.write(text)

  dic_path = abspath(dic_path)
  dump_dir = abspath(dump_dir)
  files = generate_markdown_files(parse_dict(open_dict(dic_path)))

  print('generating indices...')
  for rep, con in files['indices'].items():
    write_file(join(dump_dir, f'{rep}.md'), con)

  print('generating content...')
  for rep, con in files['content'].items():
    write_file(join(dump_dir, 'content', f'{rep.upper()}.md'), con)

  print('done.')

この関数は、コンソールコマンドとして実行されることを想定したものになっています。

最後に、ここまで書いたスクリプトの全体を示しておきます。

#-*- coding: utf-8;-*-

from os import makedirs as md
from os.path import join, exists, dirname, abspath
import csv

class ParseError(SyntaxError):
  """Raised when the dictionary TSV breaks the BEGIN_*/END_* section format."""

def get_comparator(_order):
  """Return a ``str`` subclass that orders by a custom alphabet.

  Instances compare character by character using each character's position
  in *_order*; when one string is a prefix of the other, the shorter one
  sorts first.  The class is meant to be used as a sort key, e.g.
  ``words.sort(key=get_comparator('abc'))``.

  Characters that do not occur in *_order* sort after every listed character
  and among themselves by code point, instead of raising ``ValueError``.

  Args:
    _order: Iterable of single characters in ascending dictionary order.

  Returns:
    The comparator class.
  """
  # Position table built once; first occurrence wins, like list.index().
  rank = {}
  for position, char in enumerate(_order):
    rank.setdefault(char, position)

  def sort_key(text):
    # (0, position) for listed characters, (1, code point) for the rest, so
    # list comparison gives character-wise order and then shorter-first.
    return [(0, rank[c]) if c in rank else (1, ord(c)) for c in text]

  class _Comparator(str):
    def __lt__(self, other):
      return sort_key(self) < sort_key(other)

    def __gt__(self, other):
      return sort_key(self) > sort_key(other)

    def __le__(self, other):
      return sort_key(self) <= sort_key(other)

    def __ge__(self, other):
      return sort_key(self) >= sort_key(other)

  return _Comparator

def open_dict(dic_path: str) -> list:
  """Read the TSV dictionary file at *dic_path* and return its rows.

  Each row is the list of tab-separated fields of one record.  The file is
  decoded as UTF-8 and opened with ``newline=''`` as the ``csv`` module
  requires, so line breaks inside quoted fields are kept verbatim.
  """
  with open(dic_path, encoding='utf-8', newline='') as f:
    return list(csv.reader(f, delimiter='\t'))

def parse_dict(dic: list) -> dict:
  """Convert the rows of a dictionary TSV into nested dicts.

  Rows between ``BEGIN_HEADER``/``END_HEADER``, ``BEGIN_DICTIONARY``/
  ``END_DICTIONARY`` and ``BEGIN_DEFINITION``/``END_DEFINITION`` are stored
  under the keys ``'header'``, ``'dict'`` and ``'defs'``; a key is only
  present when its section appears in the input.  Rows outside any section
  and rows whose fields are all empty are ignored.

  Args:
    dic: Rows as produced by ``csv.reader`` (lists of field strings).

  Returns:
    A dict with up to three entries: ``'header'`` maps field name to value,
    ``'defs'`` maps part-of-speech ID to its display name, and ``'dict'``
    maps word -> part-of-speech ID ->
    ``{'meaning': str, 'reference': list of str}``.

  Raises:
    ParseError: On a misplaced or unclosed section tag, or on a row with too
      few columns for its section.
  """
  # Opening tag -> (parser state, key of the result dict it fills).
  begin_tags = {
    'BEGIN_HEADER': ('header', 'header'),
    'BEGIN_DICTIONARY': ('dictionary', 'dict'),
    'BEGIN_DEFINITION': ('definition', 'defs'),
  }
  # Closing tag -> parser state it is allowed to close.
  end_tags = {
    'END_HEADER': 'header',
    'END_DICTIONARY': 'dictionary',
    'END_DEFINITION': 'definition',
  }

  ret = {}
  state = 'none'
  for row in dic:
    # csv.reader yields [] for blank lines and all-empty fields for lines
    # made only of tabs; neither carries data and row[0] would fail on [].
    if not any(row):
      continue

    tag = row[0]
    if tag in begin_tags:
      if state != 'none':
        raise ParseError(f'Unexpected {tag} tag.')
      state, key = begin_tags[tag]
      ret[key] = {}
      continue

    if tag in end_tags:
      if state != end_tags[tag]:
        raise ParseError(f'Unexpected {tag} tag.')
      state = 'none'
      continue

    if state == 'none':
      continue

    if state == 'dictionary':
      if len(row) < 3:
        raise ParseError(
          f'A dictionary row needs a word, a part-of-speech ID and a meaning: {row!r}')
      word, kind, meaning = row[:3]
      ret['dict'].setdefault(word, {})[kind] = {
        'meaning': meaning,
        # The related-words column is optional.
        'reference': row[3].split(' ') if len(row) > 3 else [],
      }
      continue

    # Remaining states ('header' and 'definition') both hold key/value rows.
    if len(row) < 2:
      raise ParseError(f'A {state} row needs a key and a value: {row!r}')
    ret['header' if state == 'header' else 'defs'][row[0]] = row[1]

  if state != 'none':
    raise ParseError(f'A match pair tag of END_{state.upper()} not found.')

  return ret

def generate_indices(dic: dict):
  """Group every word by index and by initial character.

  The result maps each part-of-speech ID found in ``dic['defs']``, plus the
  extra ID ``'ALPHABETICAL'`` (all words), to a dict of
  ``initial character -> sorted list of words``.  The initials come from the
  space-separated ``PHONETICAL_CHARS`` header field, defaulting to a-z, and
  the same characters define the sort order through ``get_comparator``.
  """
  default_chars = 'a b c d e f g h i j k l m n o p q r s t u v w x y z'
  initials = dic['header'].get('PHONETICAL_CHARS', default_chars).split(' ')

  def empty_buckets():
    return {initial: [] for initial in initials}

  indices = {}
  for pos_id in dic['defs']:
    indices[pos_id] = empty_buckets()
  indices['ALPHABETICAL'] = empty_buckets()
  ordering = get_comparator(''.join(initials))

  for word, senses in dic['dict'].items():
    head = word[0]
    # Every word goes into the all-words index and into one index per
    # part of speech it is defined for.
    for index_name in ('ALPHABETICAL', *senses):
      indices[index_name][head].append(word)

  for buckets in indices.values():
    for bucket in buckets.values():
      bucket.sort(key=ordering)

  return indices

def generate_dict_content(dic: dict):
  """Collect the entries of each content page, keyed by initial character.

  The result maps every character of the space-separated ``PHONETICAL_CHARS``
  header field (default a-z) to a list of entries sorted with
  ``get_comparator``.  Each entry is a dict with:

  * ``'surface'``: the word itself,
  * ``'meaning'``: a list of ``(part-of-speech ID, meaning split on spaces)``,
  * ``'reference'``: the related words of all its parts of speech, in order.
  """
  default_chars = 'a b c d e f g h i j k l m n o p q r s t u v w x y z'
  initials = dic['header'].get('PHONETICAL_CHARS', default_chars).split(' ')
  contents = {initial: [] for initial in initials}
  ordering = get_comparator(''.join(initials))

  for word, senses in dic['dict'].items():
    contents[word[0]].append({
      'surface': word,
      'meaning': [
        (kind, sense['meaning'].split(' '))
        for kind, sense in senses.items()
      ],
      'reference': [
        ref for sense in senses.values() for ref in sense['reference']
      ],
    })

  for entries in contents.values():
    entries.sort(key=lambda entry: ordering(entry['surface']))
  return contents

def generate_content_file(representative: str, words: list, defs: dict):
  """Render the Markdown page that explains all words sharing one initial.

  Args:
    representative: The initial character; its upper-case form is the title.
    words: Entries with the keys ``'surface'``, ``'meaning'`` (a list of
      ``(part-of-speech ID, meaning lines)``) and ``'reference'``.
    defs: Maps part-of-speech ID to its display name.

  Returns:
    The page as one string.  Each word gets a numbered meaning list and,
    when it has any non-empty related word, a list of links to them.
  """
  chunks = [f"# {representative.upper()}\n"]
  for entry in words:
    chunks.append(f"\n## {entry['surface']}\n")
    chunks.append("意味:  \n")
    for number, (kind, lines) in enumerate(entry['meaning'], start=1):
      chunks.append(f"{number}. <{defs[kind]}>  \n")
      chunks.extend(f"  {line}  \n" for line in lines)

    # Empty strings come from blank related-word columns; drop them.
    related = list(filter(None, entry['reference']))
    if not related:
      continue
    chunks.append("\n関連語:  \n")
    chunks.extend(
      f"* [{ref}](./{ref[0].upper()}.md#{ref})\n" for ref in related)
  return ''.join(chunks)

def generate_index_file(kind: str, defs: dict, index: dict):
  """Render the Markdown source of one index page.

  Args:
    kind: Part-of-speech ID of the index, or ``'ALPHABETICAL'`` for the
      all-words index.
    defs: Maps every index ID (including *kind*) to its page title.
    index: Maps an initial character to the list of words listed under it.

  Returns:
    The page as a single string: a title, then (for ``'ALPHABETICAL'`` only)
    links to the other index pages, then one section per initial whose items
    link to ``./content/<INITIAL>.md#<word>``.
  """
  parts = [f"# {defs[kind]}\n"]

  # Only the all-words index links out to the per-part-of-speech indices.
  if kind == 'ALPHABETICAL':
    parts.append("\n## 品詞別インデックス\n")
    parts.extend(
      f"* [{title}](./{pos_id.lower()}.md)\n"
      for pos_id, title in defs.items()
      if pos_id != 'ALPHABETICAL')

  for representative, words in index.items():
    parts.append(f"\n## {representative.upper()}\n")
    parts.extend(
      f"* [{word}](./content/{word[0].upper()}.md#{word})\n"
      for word in words)
  return ''.join(parts)

def generate_markdown_files(dic: dict):
  """Turn a parsed dictionary into the Markdown text of every output page.

  Returns a dict with two entries:

  * ``'content'``: initial character -> text of that character's word page,
    one per character of ``PHONETICAL_CHARS`` (default a-z);
  * ``'indices'``: page name -> text of an index page, where the all-words
    index is named ``'index'`` and the others use the lower-cased
    part-of-speech ID.
  """
  index_data = generate_indices(dic)
  entries_by_initial = generate_dict_content(dic)
  default_chars = 'a b c d e f g h i j k l m n o p q r s t u v w x y z'
  initials = dic['header'].get('PHONETICAL_CHARS', default_chars).split(' ')
  # Page titles: the part-of-speech names plus one for the all-words index.
  titles = {**dic['defs'], 'ALPHABETICAL': "全単語索引"}

  content_pages = {}
  for initial in initials:
    content_pages[initial] = generate_content_file(
      initial, entries_by_initial[initial], titles)

  index_pages = {}
  for kind, buckets in index_data.items():
    page_name = 'index' if kind == 'ALPHABETICAL' else kind.lower()
    index_pages[page_name] = generate_index_file(kind, titles, buckets)

  return {'content': content_pages, 'indices': index_pages}

def dump_markdown(dic_path: str, dump_dir: str):
  """Generate all Markdown pages from a dictionary TSV and write them out.

  Index pages are written to ``<dump_dir>/<name>.md`` and word pages to
  ``<dump_dir>/content/<INITIAL>.md``, all encoded as UTF-8.  Missing
  directories are created.  Progress is printed to standard output.

  Args:
    dic_path: Path of the dictionary TSV file.
    dump_dir: Directory that receives the generated files.
  """
  def write_file(path, text):
    # exist_ok=True replaces the racy exists()-then-makedirs() check.
    md(dirname(path), exist_ok=True)
    print(f'writing file: {path}')
    with open(path, 'w', encoding='utf-8') as f:
      f.write(text)

  dic_path = abspath(dic_path)
  dump_dir = abspath(dump_dir)
  files = generate_markdown_files(parse_dict(open_dict(dic_path)))

  print('generating indices...')
  for rep, con in files['indices'].items():
    write_file(join(dump_dir, f'{rep}.md'), con)

  print('generating content...')
  for rep, con in files['content'].items():
    write_file(join(dump_dir, 'content', f'{rep.upper()}.md'), con)

  print('done.')

if __name__ == '__main__':
  import sys

  # Exit with a usage hint instead of a bare IndexError when an argument is
  # missing; extra arguments are still ignored.
  if len(sys.argv) < 3:
    sys.exit(f'usage: {sys.argv[0]} DICTIONARY_TSV OUTPUT_DIR')
  dump_markdown(sys.argv[1], sys.argv[2])

はー。疲れました。はい、これでおそらくどの方面にも需要がないツールの完成です。「欲しかったから作った」の真骨頂ですね。最後までお付き合いいただき、ありがとうございました。

「TSV」の記事 - Crieit

TSVで定義した辞書をMarkdownで出力するツールを作った話

TL;DR

TSVに格納する辞書の形式を考えてみた

パーサーをざっくり書いてみる

抽出した情報をMarkdownにしてみる