マンガサイトを調査 実践。( Ruby )

有効な URL と 有効ではない URL を調査し、データベースに記録しよう。

Ruby 言語を使って https リクエストして調査してみましょう。 有効なURLとは漫画の画像があるページのことになります。すなわち、マンガのサイトにある全てのコンテンツの数を全部数えてみようということになります。 対象は漫画Bankです。

Python 言語の場合はこっちにあります。

kuroca.hatenablog.com

ポリシーについてはこちら

kuroca.hatenablog.com

  • レポート
    f:id:kuroca:20211106114710p:plain
    Cloudflare ドメイン
    この日本語の翻訳ニュースの配信の約1時間後、漫画bankはサイトを閉鎖しました。 そのため、以下のプログラムが対象とするサイトはすでに消失しています。

curl -is "https://mangabank.org/watch/?tour=/vol" | grep location

ターミナルでコマンドを実行してリダイレクト先のアドレスを表示するには、上記のようになります。

require 'net/http'
require 'uri'
require 'sqlite3'


# Schema used by both databases: one row per URL, keyed by an id that this
# script assigns itself from a running counter.
SQL =<<EOS
create table url (
    id INTEGER PRIMARY KEY,
    url text
    );
EOS

# Opens the SQLite file at +path+ and makes sure it has a `url` table.
#
# path  - database file name
# label - text printed before the last id when the table already existed
#
# Returns [handle, last_id]. last_id is the highest id stored in the table,
# or 0 when the table was just created or exists but holds no rows yet.
def open_url_db(path, label)
    handle = SQLite3::Database.open(path)
    found = handle.execute("SELECT COUNT(*) FROM sqlite_master WHERE TYPE='table' AND NAME='url';")
    if found[0][0] == 0
        handle.execute(SQL)
        return handle, 0
    end

    newest = handle.execute("SELECT id FROM url ORDER BY id DESC LIMIT 1;")
    # An existing but empty table yields no row; fall back to 0 instead of
    # indexing into nil.
    last_id = newest.empty? ? 0 : newest[0][0]
    puts label
    pp last_id
    puts " ---------------------------------- "
    [handle, last_id]
end

# db1 holds URLs that did not answer 200, db holds URLs that did.
db1, last_id1 = open_url_db("mb_urls_INVALID.db", "invalid")
db, last_id = open_url_db("mb_urls_VALID.db", "valid")

# Running id counters, continued from whatever is already stored.
count = last_id
count1 = last_id1

# ----------------------------------
# Requesting this URL is expected to answer with a redirect (Location header).
target_url ="https://mangabank.org/watch/?tour=/vol/"

#uri = URI.parse(target_url)

# extract redirect URL
# Requests +uri+ once and returns the value of the response's Location
# header, or nil when the response carries no such header.
def get_redirect_url(uri)
    response = Net::HTTP.get_response(uri)
    response['location']
end

# ----------------------------------

50000.times do |repeat|

    # Ask the site for a redirect; its Location header names the page the
    # scan starts from.
    uri = URI.parse(target_url)
    redirect_url = get_redirect_url(uri)
    # Without a Location header there is no page number to work with. Stop
    # with a clear message instead of a NoMethodError from split on nil.
    raise "no redirect location returned for #{target_url}" if redirect_url.nil?
    puts redirect_url
    address = redirect_url

    # The last path segment of the redirect target is used as the page number.
    s_number = address.split('/').last
    puts s_number
    number = s_number.to_i
    
    # Walks up to 11 page URLs and files each one in the VALID or INVALID
    # database according to its HTTP status code.
    #
    # address - URL looked up in the INVALID database on the first pass
    # number  - page number that +address+ ends with
    # db1     - INVALID database handle
    # count1  - last id used in the INVALID database
    # db      - VALID database handle
    # count   - last id used in the VALID database
    # second  - unless true, the first pass rewinds 10 pages before scanning;
    #           when true, a non-200 page on a later pass ends the scan
    #
    # Returns [count1, count, address, again]. again is true when at least one
    # row was inserted into the VALID database during this call.
    #
    # All values go to SQL as bound parameters, passed as a single array, so
    # no URL text is ever spliced into a statement.
    #
    # NOTE(review): this def sits inside the enclosing 50000.times block, so it
    # is re-evaluated on every iteration; it could be hoisted above the loop.
    def dbr(address,number,db1,count1,db,count,second)
        gate = false
        again = false
        11.times do |x|
            puts " ---------------------------------- "
            puts x
            if x == 0
                # First pass: if the starting address is listed as INVALID,
                # blank that row (url is overwritten with the text 'null').
                puts " ---------------------------------- "
                flag1 = db1.execute("SELECT id FROM url WHERE url=? ;", [address])
                unless flag1.empty?
                    puts "flag 1:INVALID DB id=#{flag1[0][0]} #{address}"
                    db1.execute("UPDATE url SET url='null' WHERE id=? ;", [flag1[0][0]])
                    puts "is VALID / remove from INVALID DB"
                    gate = true
                end
                unless second == true
                    # Start 10 pages before the given one.
                    number = number - 10
                    address = "https://mangabank.org/#{number}/"
                end
            else
                number = number + 1
                address = "https://mangabank.org/#{number}/"
                puts " ---------------------------------- "
                puts address

                flag2 = db1.execute("SELECT id FROM url WHERE url=? ;", [address])
                unless flag2.empty?
                    puts "flag 2:INVALID DB id=#{flag2[0][0]} #{address}"
                    puts "#{number} + INVALID"
                    # Known-invalid pages are only re-fetched when the first
                    # pass found a stale INVALID entry.
                    next unless gate
                end

                puts " ---------------------------------- "
                uri = URI.parse(address)
                res = Net::HTTP.get_response(uri)
                puts "#{address} responce code: #{res.code}"

                if res.code != '200'
                    # NOTE(review): when flag2 was non-empty this adds a second
                    # INVALID row for the same URL - confirm that is intended.
                    count1 += 1
                    puts "insert INVALID DB id: #{count1} -> #{address}"
                    db1.execute("INSERT INTO url (id,url) VALUES (?,?)", [count1, address])
                    break if second == true
                    next
                elsif !flag2.empty?
                    puts "flag 2:INVALID DB id=#{flag2[0][0]} + #{address}"
                    db1.execute("UPDATE url SET url='null' WHERE id=? ;", [flag2[0][0]])
                    puts "is VALID / remove from INVALID DB"
                end
            end

            # A page already stored as VALID ends the scan.
            flag = db.execute("select id from url where url=? ;", [address])
            unless flag.empty?
                puts "flag 0: id=#{flag[0][0]} #{address}"
                puts "already exsist in DB"
                puts
                break
            end

            # Later passes fetched the page above; only the first pass still
            # has to.
            if x == 0
                uri = URI.parse(address)
                res = Net::HTTP.get_response(uri)
                puts "#{address} responce code: #{res.code}"
            end

            if res.code == '200'
                count += 1
                puts "insert VALID DB id: #{count} -> #{address}"
                db.execute("INSERT INTO url (id,url) VALUES (?,?)", [count, address])
                again = true
            else
                count1 += 1
                puts "insert NOT-VALID-DB id: #{count1} -> #{address}"
                db1.execute("INSERT INTO url (id,url) VALUES (?,?)", [count1, address])
            end
        end
        return count1,count,address,again
    end

    # First pass over the pages around the redirect target.
    second = false
    count1,count,address,again = dbr(address,number,db1,count1,db,count,second)

    puts

    # While the previous scan stored at least one VALID page, keep probing the
    # following page numbers. The loop gives up after 5 non-200 probes in a
    # row; a 200 probe resets that budget and hands the page to dbr.
    #
    # NOTE(review): number is not advanced by dbr, so probing continues from
    # the number this loop last used - confirm that is intended.
    limit = 5
    while again == true do
        puts " ---------------------------------- "
        puts "      " + limit.to_s
        number = number + 1
        address = "https://mangabank.org/#{number}/"
        puts address
        uri = URI.parse(address)
        res = Net::HTTP.get_response(uri)
        puts "#{address} responce code: #{res.code}"

        if res.code == '200' then
            second = true
            count1,count,address,again = dbr(address,number,db1,count1,db,count,second)
            limit = 5
        else
            limit -= 1
            puts "   Retry #{limit}"

            # The URL is bound as a parameter, never spliced into the SQL text.
            flag1 = db1.execute("SELECT id FROM url WHERE url=? ;", [address])
            if flag1.empty? then
                count1 += 1
                puts "insert INVALID DB id: #{count1} -> #{address}"
                db1.execute("INSERT INTO url (id,url) VALUES (?,?)", [count1, address])
            else
                # Already recorded as INVALID: report it, do not insert again.
                puts "flag 3: id=#{flag1[0][0]} #{address}"
                puts "#{number} INVALID"
            end

            again = false if limit < 1
        end
    end
end

# Release both database handles once the scan loop has finished.
db.close
db1.close

次は、2つのデータベースをマージするための Ruby スクリプトです。

##---- Lua
##local sqlite3 = require("lsqlite3")
##
##--[[local db1 = sqlite3.open('lua_mb_urls_not.db') -- Lua
##local db2 = sqlite3.open('mb_urls_not.db') -- other
##
##local newdb = sqlite3.open('mb_url_notadditive.db') -- for working space
##local newdb1 = sqlite3.open('lua_mb_url_not1.db') -- additive & order by url
##]]
##---- Lua

####----> Ruby >
##db1 = SQLite3::Database.open "lua_mb_urls_not.db" # Lua
##db2 = SQLite3::Database.open "mb_urls_not.db" # made from python code
##newdb = SQLite3::Database.open "mb_url_notadditive.db" # additive work space
##newdb1 = SQLite3::Database.open "mb_url_not1.db" # additive & order by url
####----< Ruby <

##---- Lua
##local db1 = sqlite3.open('lua_mb_urls_new.db') -- Lua
##local db2 = sqlite3.open('mb_urls_new.db') -- other
##
##local newdb = sqlite3.open('mb_url_additive.db') -- for working space
##local newdb1 = sqlite3.open('lua_mb_url_new1.db')
##
##newdb:exec[[
##  CREATE TABLE IF NOT EXIST url (id INTEGER PRIMARY KEY, url text);
##]]
##
##newdb1:exec[[
##  CREATE TABLE IF NOT EXIST url (id INTEGER PRIMARY KEY, url text);
##]]
##---- Lua

##----> Ruby >
require 'sqlite3'

# Source databases.
db1 = SQLite3::Database.open("lua_mb_urls_new.db") # Lua
db2 = SQLite3::Database.open("mb_urls_new.db") # made from python code
# Destination databases.
newdb = SQLite3::Database.open("mb_url_additive.db") # additive work space
newdb1 = SQLite3::Database.open("mb_url_new1.db") # additive & order by url

# Table definition applied to both destinations; it does nothing when the
# table is already there.
SQL =<<EOS
create table IF NOT EXISTS url(
    id INTEGER PRIMARY KEY,
    url text
    );
EOS

[newdb, newdb1].each { |target| target.execute(SQL) }

##----< Ruby <

##---- Lua
##local smt1 = "SELECT id FROM url ORDER BY id DESC LIMIT 1 ;" 
##
##local last1 = 0
##local last2 = 0
##
##for id in db1:urows(smt1) do
##    last1 = id
##end
##
##for id in db2:urows(smt1) do
##    last2 = id
##end
##
##print("db1 has "..last1.." URLs")
##print("db2 has "..last2.." URLs")
##---- Lua

##----> Ruby >
# Highest id stored in each source database. An empty url table returns no
# row, in which case the bound is 0 and the copy loop for that source simply
# has nothing to do.
last1 = db1.execute("select id from url order by id desc limit 1")
last2 = db2.execute("select id from url order by id desc limit 1")

k = last1.empty? ? 0 : last1[0][0] # upper id bound for db1
l = last2.empty? ? 0 : last2[0][0] # upper id bound for db2

# These figures are the highest ids, which equal the row counts only when the
# ids have no gaps.
puts "db1 has #{k} URLs"
puts "db2 has #{l} URLs"

i = 0

# Continue numbering in the work database after its current highest id.
newdb_last = newdb.execute("select id from url order by id desc limit 1")
newcount = newdb_last.empty? ? 0 : newdb_last[0][0] # offset
##----< Ruby <

##---- Lua
##local newcount = 0
##for last_id in newdb:urows(smt1) do
##    newcount = last_id
##end
##
##local i = 0
##
##for url_data in db2:urows("SELECT url FROM url ;") do
##--[[
##for url_data in db2:urows("SELECT url FROM url ORDER BY url ;") do
##]]
##    i = i + 1
##    print("copy from db2 ")
##    print(i)
##    print("total "..last2)
##
##    local existflag = 0
##
##    for exist in newdb:urows("SELECT id FROM url WHERE url=\""..url_data.."\" ;") do
##        existflag = exist
##        print(url_data.." is aleady exist in DB / skip")
##    end
##
##    if existflag == 0 then -- record
##        newcount = newcount + 1
##        print()
##        print("newdb : "..newcount.." "..url_data)
##        local stmt2 = newdb:prepare[[ INSERT INTO url VALUES (:id, :url) ]]
##        stmt2:bind_names{  id = newcount,  url = url_data    }
##        stmt2:step()
##        stmt2:reset()
##        stmt2:finalize()
##    end
##end
##
##db2:close()
##
##i = 0 -- reset
##
##for url_data in db1:urows("SELECT url FROM url ;") do
##--[[
##for url_data in db1:urows("SELECT url FROM url ORDER BY url ;") do
##]]
##    i = i + 1
##    print("copy from db1 ")
##    print(i)
##    print("total "..last1)
##
##    local existflag = 0
##    
##    for exist in newdb:urows("SELECT id FROM url WHERE url=\""..url_data.."\" ;") do
##        existflag = exist
##        print(url_data.." is aleady exist in DB / skip")
##    end
##
##    if existflag == 0 then -- record
##        newcount = newcount + 1
##        print()
##        print("newdb : "..newcount.." "..url_data)
##        local stmt2 = newdb:prepare[[ INSERT INTO url VALUES (:id, :url) ]]
##        stmt2:bind_names{  id = newcount,  url = url_data    }
##        stmt2:step()
##        stmt2:reset()
##        stmt2:finalize()
##    end
##end
##
##db1:close()
##---- Lua

##----> Ruby >
# Copy every URL of db2 into the work database, skipping URLs it already
# holds. Ids are walked from 1 up to l; ids with no row are reported and
# skipped. Values are bound as parameters, so a URL containing a quote
# character cannot break the statement.
1.upto(l) do |id|
    res = db2.execute("select url from url where id=? ;", [id])
    if res.empty? then
        puts "no data / skip"
        next
    end
    x = res[0][0] # url
    puts "copy from db2"
    p id,l

    # to_s keeps a NULL url behaving like the empty string.
    res2 = newdb.execute("select url from url where url=? ;", [x.to_s])
    if !res2.empty? then
        p  x
        puts 'already exist in db / skip'
        next
    end
    puts
    newcount += 1
    url = x
    puts "#{newcount}  #{url}"
    newdb.execute("insert into url (id, url) values(?,?) ;", [newcount, url.to_s])
end

db2.close

# Same copy for db1, walking ids from 1 up to k.
1.upto(k) do |id|
    res = db1.execute("select url from url where id=? ;", [id])
    if res.empty? then
        puts "no data / skip"
        next
    end
    x = res[0][0] # url
    f = newdb.execute("select url from url where url=? ;", [x.to_s])
    puts "copy from db1"
    p id,k

    if !f.empty? then
        p res[0]
        puts 'already exist in db / skip'
        next
    else
        newcount += 1
        url = x
        puts "#{newcount}  #{url}"
        newdb.execute("insert into url (id, url ) values(?,?) ;", [newcount, url.to_s])
    end
end

db1.close
##----< Ruby <

##---- Lua
##print("new DB last id : "..newcount)
##---- Lua

##----> Ruby >
# Report the value of the id counter used for the work database.
puts "new DB last id:#{newcount}"
##----< Ruby <

##---- Lua
##local newdb1_last = 0
##for last_id in newdb1:urows(smt1) do
##    newdb1_last = last_id
##end
##
##local counter = newdb1_last -- first time ==> 0
##---- Lua

##----> Ruby > 
# Continue numbering in the sorted database after its current highest id,
# starting from 0 when it holds no rows yet.
newdb1_last = newdb1.execute("select id from url order by id desc limit 1")
counter = newdb1_last.empty? ? 0 : newdb1_last[0].pop # offset
##----< Ruby <

##---- Lua
##i = 0 -- reset
##---- Lua

##----> Ruby >
# Reset the progress counter before the next copy pass.
i = 0
##----< Ruby <

##---- Lua
##for url_data in newdb:urows("SELECT url FROM url ORDER BY url ;") do
##    i = i + 1
##    print("copy from newdb ")
##    print(i)
##    print("total "..newcount)
##
##    local existflag = 0
##    
##    for exist in newdb1:urows("SELECT id FROM url WHERE url=\""..url_data.."\" ;") do
##        existflag = exist
##        print(url_data.." is aleady exist in DB / skip")
##    end
##
##    if existflag == 0 then -- record
##        counter = counter + 1
##        print()
##        print("newdb1 : "..counter.." "..url_data)
##        local stmt2 = newdb1:prepare[[ INSERT INTO url VALUES (:id, :url) ]]
##        stmt2:bind_names{  id = counter,  url = url_data    }
##        stmt2:step()
##        stmt2:reset()
##        stmt2:finalize()
##    end
##end
##
##--[[
##newdb1:close() 
##newdb:close()
##]] -- end --
##---- Lua

##----> Ruby >
# Copy the work database into newdb1 in URL order, skipping URLs newdb1
# already holds. Values are bound as parameters, so a URL containing a quote
# character cannot break the statement.
newdb.execute("select url from url order by url ;") do |row|
    i += 1
    url = row[0]
    # to_s keeps a NULL url behaving like the empty string.
    f = newdb1.execute("select url from url where url=? ;", [url.to_s])
    puts "copy from newdb"
    p i,newcount

    if !f.empty? then
        p url
        puts 'already exist in db / skip'
        next
    else
        counter = counter + 1
        puts "#{counter}  #{url}"
        newdb1.execute("insert into url (id, url ) values(?,?) ;", [counter, url.to_s])
    end
end

newdb1.close
newdb.close
##----< Ruby <