漫画Bank マンガサイトを調査 実践。

有効な URL と 有効ではない URL を調査し、データベースに記録しよう。

Lua 言語で cURL のライブラリを使って https リクエストして調査してみましょう。

Python 言語の場合はこっちにあります。

kuroca.hatenablog.com

kuroca.hatenablog.com

kuroca.hatenablog.com

curl -is "https://mangabank.org/watch/?tour=/vol" | grep location

これは、上記のターミナルコマンドと同じリクエストを繰り返し実行し、リダイレクト先のアドレスを記録していくプログラムです。 ただし、一度有効な URL が見つかる(魚が食いつく)と、その前後の番号の URL も続けて調べ、有効な URL が続くかぎり深く掘り進めるようになっています。

local sqlite3 = require("lsqlite3")
local curl = require('cURL')
--- Open a SQLite database file, raising a readable error when it cannot be opened.
-- sqlite3.open returns nil, an error code and a message on failure; without this
-- check the failure would only surface later as "attempt to index a nil value".
-- @param path  file name of the database
-- @return the open database handle
local function open_db(path)
    local handle, code, msg = sqlite3.open(path)
    if not handle then
        error(("cannot open %s: %s (code %s)"):format(path, tostring(msg), tostring(code)))
    end
    return handle
end

local db = open_db('lua_mb_urls_new.db')  -- VALID-DB: URLs that answered with status 200
local db1 = open_db('lua_mb_urls_not.db') -- INVALID-DB: URLs that answered with anything else

-- IF NOT EXISTS keeps the statement valid on every run after the first one;
-- a plain CREATE TABLE fails with "table url already exists" and the original
-- code never looked at that return code.
local create_url_table = [[
  CREATE TABLE IF NOT EXISTS url (id INTEGER PRIMARY KEY, url);
]]
for _, conn in ipairs{ db, db1 } do
    if conn:exec(create_url_table) ~= sqlite3.OK then
        error("cannot create table url: " .. conn:errmsg())
    end
end


--[[ lsqlite3 usage ]]-- 
--for row in db:nrows("SELECT * FROM url") do
--  print(row.id, row.url)
--end

--for row in db:rows("SELECT * FROM url") do
--  print(row[1], row[2])
--end

--for id,url in db:urows("SELECT * FROM url") do
--  print(id, url)
--end


-- Crawl driver.
--
-- Each round asks the site's "tour" URL for a redirect, records the redirect
-- target in VALID-DB (db) and then probes the numbered pages around it:
-- first the 20 addresses from (n - 10) to (n + 9), then further batches of 6
-- for as long as the last probed page answered 200. Pages that do not answer
-- 200 are recorded in INVALID-DB (db1).

-- id of the newest VALID-DB row written by this run (0 before the first insert)
local id_count = 0

local LAST_ID_SQL = "SELECT id FROM url ORDER BY id DESC LIMIT 1;"

--- Collect the ids of every row in `conn` whose url column equals `url`.
-- The URL comes from a remote server, so it is bound as a parameter instead
-- of being spliced into the SQL text. The statement is always finalized, so
-- no cursor is left open on the connection.
-- @param conn  lsqlite3 database handle
-- @param url   URL string to look up
-- @return array of matching ids (empty when there is no match)
local function find_ids(conn, url)
    local stmt = conn:prepare("SELECT id FROM url WHERE url = ?")
    if not stmt then
        error("prepare failed: " .. conn:errmsg())
    end
    stmt:bind_values(url)
    local ids = {}
    for id in stmt:urows() do
        ids[#ids + 1] = id
    end
    stmt:finalize()
    return ids
end

--- Append `url` to the url table of `conn` with id = (largest existing id) + 1.
-- A failing INSERT is reported but does not stop the crawl.
-- @return the id that was used
local function insert_url(conn, url)
    local new_id = 0
    for id in conn:urows(LAST_ID_SQL) do
        new_id = id
    end
    new_id = new_id + 1

    local stmt = conn:prepare("INSERT INTO url VALUES (:id, :url)")
    if not stmt then
        error("prepare failed: " .. conn:errmsg())
    end
    stmt:bind_names{ id = new_id, url = url }
    local rc = stmt:step()
    local msg = conn:errmsg()
    stmt:finalize()
    if rc ~= sqlite3.DONE then
        print("insert failed for " .. url .. ": " .. msg)
    end
    return new_id
end

--- Overwrite the url of INVALID-DB row `id` with the placeholder 'null'.
-- The original wrapped db1:exec in assert, but exec returns a numeric code and
-- every number is truthy in Lua, so that assert could never fire; the result
-- code is compared explicitly here.
local function blank_invalid_row(id)
    local stmt = db1:prepare("UPDATE url SET url='null' WHERE id = ?")
    if not stmt then
        error("prepare failed: " .. db1:errmsg())
    end
    stmt:bind_values(id)
    local rc = stmt:step()
    local msg = db1:errmsg()
    stmt:finalize()
    if rc ~= sqlite3.DONE then
        error("update failed: " .. msg)
    end
end

--- Run the transfer configured on `easy`.
-- NOTE(review): depending on how Lua-cURL was loaded, a failed transfer may
-- raise a Lua error; pcall keeps one failed request from ending the whole run.
-- @return true when perform() did not raise, false otherwise
local function try_perform(easy)
    local ok, err = pcall(function() return easy:perform() end)
    if not ok then
        print("request failed:", err)
    end
    return ok
end

--- Build the page URL for a numeric address.
local function page_url(address_number)
    return "https://mangabank.org/"..tostring(address_number).."/"
end

--- Test one page URL and record the outcome.
-- @param easy            curl easy handle reused for the request
-- @param url             page URL to test
-- @param retest_invalid  when true, a URL already listed in INVALID-DB is
--                        requested again instead of being skipped
-- @return true  when the page answered 200 (recorded in VALID-DB),
--         false when it did not (recorded in INVALID-DB unless already there)
--               or the request itself failed,
--         nil   when the URL was skipped because it is already known
local function probe(easy, url, retest_invalid)
    print()
    print("test:", url)

    if #find_ids(db, url) > 0 then
        print(url.." is aleady exist in VALID-DB")
        return nil
    end

    local invalid_ids = find_ids(db1, url)
    local known_invalid = #invalid_ids > 0
    if known_invalid and not retest_invalid then
        print(url.." is aleady exist in INVALID-DB")
        return nil
    end

    easy:setopt(curl.OPT_URL, url)
    if not try_perform(easy) then
        -- Unknown result: record nothing, but report "not valid" so the
        -- follow-up scan cannot keep running on a dead connection.
        return false
    end
    local res_code = easy:getinfo_response_code()

    if res_code == 200 then
        id_count = insert_url(db, url)
        print(res_code,url.." record in VALID-DB id:"..tostring(id_count))
        print()
        -- Keep the two databases disjoint: a page that is valid now must not
        -- stay listed as invalid.
        for _, id in ipairs(invalid_ids) do
            print(url.." is exist in INVALID-DB but now VAILD.")
            blank_invalid_row(id)
        end
        return true
    end

    print(res_code,url.." is INVALID")
    print()
    if known_invalid then
        -- Re-tested and still invalid: do not add a duplicate row.
        print(url.." is aleady exist in INVALID-DB")
        return false
    end
    local invalid_id = insert_url(db1, url)
    print(res_code,url.." record in INVALID-DB id:"..tostring(invalid_id))
    print()
    return false
end

--- One crawl round: follow the tour redirect once, then scan its neighbourhood.
-- @param easy  curl easy handle already configured for the tour URL
local function crawl_round(easy)
    if not try_perform(easy) then
        return
    end

    -- The redirect is not followed; only its target is read.
    local res_location = easy:getinfo_redirect_url()
    print()
    print("redirect URL:",res_location)
    if not res_location then
        -- No redirect in the response; the original crashed here on nil.
        return
    end

    if #find_ids(db, res_location) > 0 then
        print(res_location.." is aleady exist in VALID-DB")
        return
    end

    -- The redirect target is valid by definition: drop it from INVALID-DB.
    -- `remove` also switches the first scan into "re-test known invalid pages".
    local remove = false
    for _, id in ipairs(find_ids(db1, res_location)) do
        remove = true
        print(res_location.." is VALID.remove from INVALID-DB")
        blank_invalid_row(id)
    end

    id_count = insert_url(db, res_location)
    print(res_location.." record in VALID-DB id:"..tostring(id_count))

    -- example: 'https://mangabank.org/2457593/' -> 2457593
    local address_number = tonumber(res_location:match("%d+"))
    if not address_number then
        return -- nothing numeric to scan around
    end

    -- First scan: 20 addresses, from (n - 10) up to (n + 9).
    -- The redirect target itself is skipped by probe() because it has just
    -- been written to VALID-DB, so no special case is needed for it.
    address_number = address_number - 11
    local record = false
    for _ = 1, 20 do
        print("VALID DB DATA size :"..id_count)
        address_number = address_number + 1
        local outcome = probe(easy, page_url(address_number), remove)
        if outcome ~= nil then
            record = outcome
        end
    end

    -- Keep digging upward in batches of 6 while the most recently tested page
    -- was valid. Pages already known (valid or invalid) are skipped here.
    while record do
        for _ = 1, 6 do
            address_number = address_number + 1
            local outcome = probe(easy, page_url(address_number), false)
            if outcome ~= nil then
                record = outcome
            end
        end
    end
end

for bigloop = 1, 1000 do
    local easy = curl.easy{
        url            = "https://mangabank.org/watch/?tour=/vol",
        followlocation = 0, -- do not follow; only the redirect target is needed
        maxredirs      = 2,
        [curl.OPT_VERBOSE] = 0,
    }

    -- Response bodies are collected here only so curl does not print them.
    local buffer = {}
    easy:setopt_writefunction(table.insert, buffer)

    crawl_round(easy)

    easy:close()
end -- big loop end

db1:close()
db:close()

Lua 言語は軽量で、わりと速いのですが、数万件の https リクエストを処理するにはやはり時間がかかります。

複数のプロセスでリクエストして、あとから結果をマージすると、速く多くのデータが集まります。 例えば、同じようなプログラムを他の言語(でなくてもいいですが)で書いて、同じような別のデータベースに記録するようにして、2つの結果のデータベースの中身を比べながら、どちらにも存在するデータ、どちらかにしかないデータをまた別のデータベースに記録していくと、重複なくデータが抽出できます。 Ruby , python , Lua はよく似ている言語なので、書き比べてみてはどうでしょうか(参考: 「マンガサイトを調査 実践。( python ) - 黒猫クックブック」)。 ちょっと毛色の違う Go 言語で書いてみると、ずいぶんと速かったので、http リクエストの実験として色々とやってみるのもいいかもしれません。

これは、釣り( fishing )です。 さびき釣りのようなイメージです。

ja.m.wikipedia.org

リクエストのリトライまでの間隔が短ければ、より速いので cURL を使い、さらに、リダイレクト先までフォローせずに、リクエストの戻ってきたヘッダーの中の location を読み取りリダイレクト先の URL を得ます。なるべくリクエストにかかる時間のオーバーヘッドを減らすということで速く釣りあげます。

2つのデータベースをマージするためのもの。Lua ( / Ruby )

local sqlite3 = require("lsqlite3")

--[[local db1 = sqlite3.open('lua_mb_urls_not.db') -- Lua
local db2 = sqlite3.open('mb_urls_not.db') -- other

local newdb = sqlite3.open('mb_url_notadditive.db') -- for working space
local newdb1 = sqlite3.open('lua_mb_url_not1.db') -- additive & order by url
]]

----> Ruby >
--db1 = SQLite3::Database.open "lua_mb_urls_not.db" # Lua
--db2 = SQLite3::Database.open "mb_urls_not.db" # made from python code
--newdb = SQLite3::Database.open "mb_url_notadditive.db" # additive work space
--newdb1 = SQLite3::Database.open "mb_url_not1.db" # additive & order by url
----< Ruby <

--- Open a SQLite database file, raising a readable error when it cannot be opened.
-- sqlite3.open returns nil, an error code and a message on failure.
-- @param path  file name of the database
-- @return the open database handle
local function open_db(path)
    local handle, code, msg = sqlite3.open(path)
    if not handle then
        error(("cannot open %s: %s (code %s)"):format(path, tostring(msg), tostring(code)))
    end
    return handle
end

-- The two input databases to merge.
local db1 = open_db('lua_mb_urls_new.db') -- Lua
local db2 = open_db('mb_urls_new.db') -- other

-- Output databases.
local newdb = open_db('mb_url_additive.db') -- for working space
local newdb1 = open_db('lua_mb_url_new1.db') -- filled from newdb in url order

-- The keyword is EXISTS: "IF NOT EXIST" is a syntax error, so the tables were
-- never created and the first query against them failed with "no such table".
local create_url_table = [[
  CREATE TABLE IF NOT EXISTS url (id INTEGER PRIMARY KEY, url text);
]]
for _, conn in ipairs{ newdb, newdb1 } do
    if conn:exec(create_url_table) ~= sqlite3.OK then
        error("cannot create table url: " .. conn:errmsg())
    end
end

----> Ruby >
--require 'sqlite3'

--db1 = SQLite3::Database.open "lua_mb_urls_new.db" # Lua
--db2 = SQLite3::Database.open "mb_urls_new.db" # made from python code
--newdb = SQLite3::Database.open "mb_url_additive.db" # additive work space
--newdb1 = SQLite3::Database.open "mb_url_new1.db" # additive & order by url
--
--SQL =<<EOS
--create table IF NOT EXISTS url(
--    id INTEGER PRIMARY KEY,
--    url text
--    );
--EOS
--
--newdb.execute(SQL)
--newdb1.execute(SQL)
--
----< Ruby <

-- Query that yields the largest id of the url table (no row when it is empty).
local smt1 = "SELECT id FROM url ORDER BY id DESC LIMIT 1 ;"

-- Largest id found in each input database; stays 0 when the table has no rows.
-- It is reported below as that database's URL count.
local last1, last2 = 0, 0

for newest in db1:urows(smt1) do last1 = newest end
for newest in db2:urows(smt1) do last2 = newest end

print("db1 has " .. last1 .. " URLs")
print("db2 has " .. last2 .. " URLs")

----> Ruby >
--last1 = db1.execute("select id from url order by id desc limit 1")
--last2 = db2.execute("select id from url order by id desc limit 1")
--
--k = last1[0].pop # offset
--l = last2[0].pop
--
--puts "db1 has #{k} URLs"
--puts "db2 has #{l} URLs"
--
--i = 0
--
--newdb_last = newdb.execute("select id from url order by id desc limit 1")
--if !newdb_last.empty? then
--    newcount = newdb_last[0].pop # offset
--else
--    newcount = 0
--end
----< Ruby <

-- Largest id already present in newdb; new rows continue from here.
local newcount = 0
for last_id in newdb:urows(smt1) do
    newcount = last_id
end

-- Progress counter; the later newdb -> newdb1 pass resets and reuses it.
local i = 0

--- Copy every URL of `src` that is not yet in newdb into newdb.
-- Replaces two identical hand-written loops. The lookup and insert statements
-- are prepared once and the URL is bound as a parameter, so a quote character
-- inside a URL can no longer break the SQL.
-- @param src      source database handle
-- @param label    name printed in the progress lines ("db1" / "db2")
-- @param total    value printed after "total "
-- @param last_id  largest id currently used in newdb
-- @return the largest id used in newdb after the copy
local function copy_into_newdb(src, label, total, last_id)
    local lookup = newdb:prepare("SELECT id FROM url WHERE url = ?")
    local insert = newdb:prepare("INSERT INTO url VALUES (:id, :url)")
    if not (lookup and insert) then
        error("prepare failed: " .. newdb:errmsg())
    end

    local seen = 0
    -- NULL urls are filtered out: a nil first value would silently end the
    -- generic `for` and drop every row after it.
    -- (Add "ORDER BY url" to the query to copy in url order instead.)
    for url_data in src:urows("SELECT url FROM url WHERE url IS NOT NULL ;") do
        seen = seen + 1
        print("copy from "..label.." ")
        print(seen)
        print("total "..total)

        lookup:bind_values(url_data)
        local already_there = lookup:step() == sqlite3.ROW
        lookup:reset()

        if already_there then
            print(url_data.." is aleady exist in DB / skip")
        else
            last_id = last_id + 1
            print()
            print("newdb : "..last_id.." "..url_data)
            insert:bind_names{ id = last_id, url = url_data }
            if insert:step() ~= sqlite3.DONE then
                print("insert failed: " .. newdb:errmsg())
            end
            insert:reset()
        end
    end

    lookup:finalize()
    insert:finalize()
    return last_id
end

newcount = copy_into_newdb(db2, "db2", last2, newcount)
db2:close()

newcount = copy_into_newdb(db1, "db1", last1, newcount)
db1:close()

----> Ruby >
--l.times do |index|
--    i += 1
--    res = db2.execute("select url from url where id='#{i}' ;")
--    if res.empty? then
--        puts "no data / skip"
--        next
--    end
--    x = res[0][0] # url
--    puts "copy from db2"
--    p i,l
--
--    res2 = newdb.execute("select url from url where url='#{x}' ;")
--    if !res2.empty? then
--        p  x
--        puts 'already exist in db / skip'
--      next
--    end
--    puts
--    newcount += 1
--    url = x
--    #url.gsub!(/\'/,"\'\'")
--    puts "#{newcount}  #{url}"
--    newdb.execute("insert into url (id, url) values( '#{newcount}','#{url}') ;")
--end
--
--db2.close
--
--i = 0
--k.times do |index|
--    i += 1
--    res = db1.execute("select url from url where id='#{i}' ;")
--    if res.empty? then
--        puts "no data / skip"
--        next
--    end
--    x = res[0][0] # url
--    f = newdb.execute("select url from url where url='#{x}' ;")
--    puts "copy from db1"
--    p i,k
--
--    if !f.empty? then
--        p res[0]
--        puts 'already exist in db / skip'
--        next
--    else
--        newcount += 1
--        url = x
--        #url.gsub!(/\'/,"\'\'")
--        puts "#{newcount}  #{url}"
--        newdb.execute("insert into url (id, url ) values( '#{newcount}','#{url}') ;")
--    end
--end
--
--db1.close
----< Ruby <

-- Report the highest id now used in the working database.
print("new DB last id : " .. tostring(newcount))

----> Ruby >
--puts "new DB last id:#{newcount}"
----< Ruby <

-- Highest id already stored in newdb1; remains 0 when its table is empty.
local newdb1_last = 0
for newest in newdb1:urows(smt1) do newdb1_last = newest end

-- Ids for rows appended to newdb1 continue after the existing ones
-- (first run: starts from 0).
local counter = newdb1_last

----> Ruby > 
--newdb1_last = newdb1.execute("select id from url order by id desc limit 1")
--if !newdb1_last.empty? then
--    counter = newdb1_last[0].pop # offset
--else
--    counter = 0
--end
----< Ruby <

i = 0 -- reset

----> Ruby >
--i = 0
----< Ruby <

-- Copy every URL of newdb, in url order, into newdb1 unless it is already
-- there. The statements are prepared once and the URL is bound as a parameter,
-- so a quote character inside a URL cannot break the SQL.
do
    local lookup = newdb1:prepare("SELECT id FROM url WHERE url = ?")
    local insert = newdb1:prepare("INSERT INTO url VALUES (:id, :url)")
    if not (lookup and insert) then
        error("prepare failed: " .. newdb1:errmsg())
    end

    -- NULL urls are filtered out: a nil first value would silently end the
    -- generic `for` and drop every row after it.
    for url_data in newdb:urows("SELECT url FROM url WHERE url IS NOT NULL ORDER BY url ;") do
        i = i + 1
        print("copy from newdb ")
        print(i)
        print("total "..newcount)

        lookup:bind_values(url_data)
        local already_there = lookup:step() == sqlite3.ROW
        lookup:reset()

        if already_there then
            print(url_data.." is aleady exist in DB / skip")
        else
            counter = counter + 1
            print()
            print("newdb1 : "..counter.." "..url_data)
            insert:bind_names{ id = counter, url = url_data }
            if insert:step() ~= sqlite3.DONE then
                print("insert failed: " .. newdb1:errmsg())
            end
            insert:reset()
        end
    end

    lookup:finalize()
    insert:finalize()
end

-- Close the output databases explicitly (these calls were commented out
-- before, leaving both handles open until the process exited).
newdb1:close()
newdb:close()
-- end --


----> Ruby >
--newdb.execute("select url from url order by url ;") do |row|
--    i += 1
--    url = row[0]
--    #url.gsub!(/\'/,"\'\'")
--    f = newdb1.execute("select url from url where url='#{url}' ;")
--    puts "copy from newdb"
--    p i,newcount
--
--    if !f.empty? then
--        p url
--        puts 'already exist in db / skip'
--        next
--    else
--        counter = counter + 1
--        puts "#{counter}  #{url}"
--        newdb1.execute("insert into url (id, url ) values( '#{counter}','#{url}') ;")
--    end
--end
--
--newdb1.close
--newdb.close
----< Ruby <