2022-02-06に更新

タイトルから書籍情報を探す。

読了目安:15分

データを詰め込んだデータベースから本のタイトル(あれば著者名も)を取り出し、国立国会図書館サーチ(NDL サーチ)でタイトルを検索して、「これかな?」という書籍情報を別のデータベースに詰め込むプログラム。

目的は、タイトルしかわからないマンガ(コミック)について、出版社や ISBN などの詳細情報をたどって得ること。ISBN からタイトル情報へ行き着くものはあるが、その逆(タイトルから ISBN へ行き着くもの)は知らないため。

それ以上説明は省くが、こちらからさかのぼっていくと、きっとわかる。
https://crieit.net/boards/manga-B/fc4a0259928ca001b58935ddd7cbb322

さかのぼる

require "faraday"
require "faraday/net_http"
require "net/http"
require 'nokogiri'
require 'sqlite3'
require 'time'
require 'date'

# Small client for the NDL Search OpenSearch endpoint (https://iss.ndl.go.jp).
class NdlSearch

  # Searches NDL by title, optionally narrowed by creator, and returns the
  # hits as plain hashes.
  #
  # Each <item> element of the response becomes one Hash. Keys are the child
  # element names (namespaces removed); when a child carries a `type`
  # attribute, the type (without its `dcndl:` / `dcterms:` prefix) is added as
  # an extra key as well. Values are the unique contents joined with ','.
  #
  # Items are parsed sequentially so that keys appear in document order.
  # Callers that iterate the hash (e.g. "insert on title, then update the
  # other columns") depend on that order being stable; the previous
  # thread-per-node implementation mutated one shared hash from many threads
  # and could not guarantee it.
  #
  # @param title [String] title to search for
  # @param creator [String, nil] creator name; omitted from the query only
  #   when nil (an empty string is still sent, and a notice is printed)
  # @return [Array<Hash{String=>String}>] one hash per item, or the single
  #   sentinel `{"totalResults"=>"0"}` when the response contains no item
  def get_book_info(title, creator = nil)
    query = {
      :mediatype => 1,
      :cnt => 100
    }
    query[:title] = title
    query[:creator] = creator if creator
    if creator == ''
      puts "::::::::::::::::::::::::::::::::::::"
      puts 'author :??'
    end
    puts
    print "query :#{query}"
    puts
    response = ndl_get('/api/opensearch', query)

    xml = Nokogiri::XML(response.body)
    xml.remove_namespaces!
    items = xml.xpath('//item')

    if items.empty?
      puts
      puts 'ndl has no item'
      # Sentinel understood by the caller as "nothing found".
      return [{"totalResults"=>"0"}]
    end

    items.map { |item| parse_item(item) }
  end

  private

  # Flattens one <item> node into a Hash of comma-joined unique values.
  def parse_item(item)
    fields = {}
    item.children.each do |node|
      key = node.name
      next if key == 'text' # whitespace/text nodes between elements

      val = node.content.to_s
      label = node.attribute("type")
      if label
        label = label.to_s.gsub(/^dcndl:|^dcterms:/,'')
        add_unique(fields, label, val)
        # Under the element name the value is stored prefixed with its type.
        val = "#{label}:#{val}"
      end
      add_unique(fields, key, val)
    end
    fields.map { |key, vals| [key, vals.join(',')] }.to_h
  end

  # Appends val to fields[key] unless that exact value is already present.
  def add_unique(fields, key, val)
    bucket = (fields[key] ||= [])
    bucket << val unless bucket.include?(val)
  end

  # Issues a GET request against the NDL host and returns the Faraday response.
  def ndl_get(path, pram)
    con = Faraday.new(:url => 'https://iss.ndl.go.jp') do |f|
      f.request  :url_encoded
      #f.response :logger
      f.adapter :net_http
    end
    con.get path, pram
  end
end

# DB
#
# DDL for the tbl_bookdata table that this script fills in.
# Nothing in the visible script executes this statement automatically; it has
# to be run by hand (via `execute(SQL)` on the output database) when the
# output database does not contain the table yet.
# Note: W3CDTF is declared as integer while every other data column is text.
SQL =<<EOS
create table tbl_bookdata (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    book_title text,
    url text,
    author text,
    creatortranscription text,
    volume text,
    seriestitle text,
    publisher text,
    isbn text,
    date text,
    W3CDTF integer,
    mangathank_title text,
    ex_id integer,
    tags text
);
EOS

# --- Database handles and resume position ---------------------------------
#
# new_db : output database; its tbl_bookdata table is read here to find the
#          last id that has already been written.
# db     : input database; its tbl_manga table provides the highest id.
#
# Alternative files used at other times:
#   new_db: "bookdata_fbay_py.db", "bookdata.db"
#   db    : "fbay_python3.db", "mangathank_new.db"
new_db = SQLite3::Database.open("bookdata_fbay_py_3.db")
db = SQLite3::Database.open("../gotest/fbay_python3.db")

# Uncomment once to create tbl_bookdata in a fresh output database:
#new_db.execute(SQL)

# Author/title carried over from one row to the next by the main loop.
temp_title = ''
temp_author = ''

# Highest id in tbl_manga; stays 0 when the table has no rows.
$index = 0
db.execute("select id from tbl_manga order by id desc limit 1 ;").each do |row|
  $index = row.first.to_i
  puts "last id : '#{$index}'"
end
looptimes = $index

# Highest id already present in tbl_bookdata; stays 0 for an empty table.
count = 0
new_db.execute("select id from tbl_bookdata order by id desc limit 1 ;").each do |row|
  count = row.first.to_i
end

# First id to process in this run.
start = count + 1
#threads = []
#m = Mutex.new
# Main pass: for every remaining tbl_manga id, derive a clean title and an
# author candidate from the scraped title, ask NDL Search for a match and
# write one row per id into tbl_bookdata.
#
# Changes compared with the earlier version of this loop:
# * The body no longer runs inside a Fiber. `break` inside the Fiber block
#   was a break from a proc-closure (LocalJumpError) instead of a loop exit.
# * Every SQL statement uses bound parameters. Values are therefore never
#   quote-escaped by hand, and the search title is no longer mutated (the
#   old in-place escaping doubled quotes again on every retry and sent the
#   escaped text to NDL).
# * An id that has no row in tbl_manga is skipped instead of being searched
#   with an empty title.

# NDL result key => tbl_bookdata column for the simple "copy value" fields.
NDL_KEY_TO_COLUMN = {
  'author'               => 'author',
  'creatorTranscription' => 'creatortranscription',
  'volume'               => 'volume',
  'link'                 => 'url',
  'publisher'            => 'publisher',
  'ISBN'                 => 'isbn',
  'seriesTitle'          => 'seriestitle',
  'date'                 => 'date',
  'W3CDTF'               => 'W3CDTF'
}.freeze

# Inserts one tbl_bookdata row. `row` maps column name => value; the column
# names are literals from this script, only the values are bound.
insert_bookdata = lambda do |row|
  columns = row.keys.join(', ')
  marks = Array.new(row.size, '?').join(', ')
  new_db.execute("insert into tbl_bookdata (#{columns}) values (#{marks});", row.values)
end

# Sets a single column of the tbl_bookdata row with the given id.
update_bookdata = lambda do |id, column, value|
  new_db.execute("update tbl_bookdata set #{column} = ? where id = ?;", [value.to_s, id])
end

(start..looptimes).each do
  # Guard kept from the earlier version: processing stops one id before
  # `looptimes`, so the highest tbl_manga id is never handled.
  # NOTE(review): confirm that leaving out the last id is intended.
  break if count >= looptimes - 1

  count += 1
  puts
  puts "::::::::::::::::::::::::::::::::::::::::"
  print 'id:', count, ' '
  search_data = db.execute("select book_title,author,title,id,tags from tbl_manga where id = ? ;", [count])
  if search_data.empty?
    puts 'empty'
    next # no source row for this id: nothing to search for
  end

  # Row layout: [0] book_title, [1] author, [2] title, [3] id, [4] tags
  book_data = search_data.pop
  ex_id = book_data[3].to_s
  mangathank_title = book_data[2].to_s
  tags = book_data[4].to_s

  puts book_data[2]

  if book_data[2] == "null"
    p count
    pp book_data
    insert_bookdata.call({
      'id' => count,
      'book_title' => 'book_title:nothing',
      'author' => 'author:nothing',
      'mangathank_title' => mangathank_title,
      'ex_id' => ex_id,
      'tags' => tags
    })
    next
  end

  # --- author candidate: text inside the first [...] of the title --------
  author_data = book_data[2].to_s.slice(/((?<=\[).*?(?=\]))/)
  if author_data
    # Turn the various "A x B" / "A×B" / "A & B" / "A(B)" separators into spaces.
    author_data.gsub!(/\ x\ /,' ')
    author_data.sub!(/((?<=[\p{Hiragana}\p{Han}\p{Katakana}])x(?=[\p{Hiragana}\p{Han}\p{Katakana}]))/,' ')
    author_data.gsub!(/\(|\)/,"\(" =>' ',"\)"=>'')
    author_data.gsub!(/×/,' ')
    author_data.gsub!(/\ &/,' ')
  end

  # With several names, search with the last one first and keep the others
  # in str_array as fallbacks for retries.
  str_array = nil
  if /(\ )/.match(author_data)
    str_array = author_data.to_s.split
    person = str_array.pop
  else
    person = author_data.to_s
  end
  print("author_data: ",  author_data , "  person: " , person)
  puts

  # --- search title: strip volume/chapter/edition markers and brackets ----
  num = book_data[2].to_s.slice(/((?<=第)\d+(?=巻|卷$))/)
  book_data_0 = book_data[2].to_s.gsub(/((?=第).*(巻|卷))/,'')
  book_data_0.gsub!(/((?=第).*話)/,'')
  book_data_0.gsub!(/(.(?<=\()文庫版(?=\)).)/,'')
  book_data_0.gsub!(/(.(?<=\[)文庫版(?=\]).)/,'')
  book_data_0.gsub!(/文庫版/,'')
  book_data_0.gsub!(/フルカラー版/,'')
  book_data_0.gsub!(/カラー版/,'')
  book_data_0.gsub!(/(.(?<=\()完(?=\)).)/,'')
  book_data_0.gsub!(/(.(?<=【).*(?=】).)/,'')
  book_data_0.gsub!(/(.(?<=\[).+?(?=\]).)/,'')
  book_data_0.lstrip!
  book_data_0.rstrip!

  # Same series as the previous row: reuse the author that was remembered
  # for it (only when this row offered several names to choose from).
  if book_data_0 == temp_title
    person = temp_author if str_array
  else
    temp_title = book_data_0
    temp_author = person
  end

  # Append the volume number (leading zeros dropped) to the search title.
  book_data_0 += ' ' + num.to_i.to_s if num

  puts
  puts book_data_0
  puts

  # Row written when NDL offers no usable match for this id.
  insert_unmatched = lambda do
    insert_bookdata.call({
      'id' => count,
      'book_title' => book_data_0,
      'author' => author_data.to_s,
      'mangathank_title' => mangathank_title,
      'ex_id' => ex_id,
      'tags' => tags
    })
  end

  stars = "☆☆☆☆☆☆☆☆☆☆☆☆☆☆☆☆☆☆☆☆☆☆☆☆☆☆☆☆☆☆☆☆☆☆☆☆☆☆☆☆"
  ndl_search = NdlSearch.new
  retry_search = true

  while retry_search
    retry_search = false
    res = ndl_search.get_book_info(book_data_0, person)

    if res[0].nil?
      puts "res: empty"
      puts book_data_0
      insert_unmatched.call
      next
    end

    # `done` is set as soon as a result contains a key this script does not
    # store; the remaining keys of that result are still processed, later
    # results are not looked at.
    # NOTE(review): a result made only of known keys would let the next
    # result insert the same id again — confirm this cannot happen.
    done = false
    res[0..99].each do |book|
      break if done

      # Skip hits whose extent marks them as video/disc media.
      if book.key?("extent")
        extent = book["extent"]
        puts "extent:#{extent}"
        if /ビデオ|DVD|dvd|ディスク/.match?(extent)
          puts ''
          puts 'SKIP'
          next
        end
      end

      book.each do |key, val|
        case key
        when 'totalResults'
          # "Nothing found" sentinel: retry with the next author candidate,
          # or store the unmatched row when none is left.
          if str_array && !str_array.empty?
            person = str_array.shift
            puts
            print "#{person} ?"
            puts
            puts
            retry_search = true
          else
            insert_unmatched.call
          end
          break
        when 'title'
          temp_author = person
          puts stars
          puts "     #{count}"
          puts ""
          puts "#{key}:#{val}"
          insert_bookdata.call({
            'id' => count,
            'book_title' => val.to_s,
            'mangathank_title' => mangathank_title,
            'ex_id' => ex_id,
            'tags' => tags
          })
        when 'author', 'creatorTranscription', 'publisher', 'seriesTitle', 'date'
          puts "#{key}:#{val}"
          update_bookdata.call(count, NDL_KEY_TO_COLUMN.fetch(key), val)
        when 'volume', 'link'
          update_bookdata.call(count, NDL_KEY_TO_COLUMN.fetch(key), val)
        when 'ISBN'
          puts "#{key}:#{val}"
          puts ""
          puts stars
          update_bookdata.call(count, NDL_KEY_TO_COLUMN.fetch(key), val)
        when 'W3CDTF'
          puts "#{key}:#{val}"
          puts ""
          update_bookdata.call(count, NDL_KEY_TO_COLUMN.fetch(key), val)
        else
          done = true
        end
      end
    end
  end
end
ツイッターでシェア
みんなに共有、忘れないようにメモ

view_list マンガサイトにつひて
第2回 スキャンレーション
第3回 ある一つのサイトについての
第4回 タイトルから書籍情報を探す。
第5回 漫画Bank / La « mangabank.org » a disparu.
第6回 `_why`

tomato

Crieitは誰でも投稿できるサービスです。 是非記事の投稿をお願いします。どんな軽い内容でも投稿できます。

また、「こんな記事が読みたいけど見つからない!」という方は是非記事投稿リクエストボードへ!

有料記事を販売できるようになりました!

こじんまりと作業ログやメモ、進捗を書き残しておきたい方はボード機能をご利用ください。
ボードとは?

コメント