漫画Bank マンガサイトを調査 実践。
有効な URL と 有効ではない URL を調査し、データベースに記録しよう。
Lua 言語で cURL のライブラリを使って https リクエストして調査してみましょう。
Python 言語の場合はこっちにあります。
curl -is "https://mangabank.org/watch/?tour=/vol" | grep location
これは、上記のターミナルコマンドを実行してリダイレクトされるアドレスを記録していくプログラムです。 ただ、一度噛みつくと深く掘り返すようになっています。
--- Redirect crawler for mangabank.org.
--
-- Each pass requests the "tour" URL without following redirects, reads the
-- redirect target from the response, and records it in the VALID database.
-- It then probes the numerically neighbouring page URLs (10 below to 9 above
-- the target) and keeps walking upwards in batches of 6 for as long as the
-- most recently probed URL answered 200.  URLs answering 200 are stored in
-- VALID-DB, everything else in INVALID-DB.
local sqlite3 = require("lsqlite3")
local curl = require('cURL')

local SEED_URL = "https://mangabank.org/watch/?tour=/vol"
local PAGE_PREFIX = "https://mangabank.org/"
local LAST_ID_SQL = "SELECT id FROM url ORDER BY id DESC LIMIT 1;"

--- Open a SQLite database file, raising a readable error on failure.
local function open_db(path)
  local handle, _, errmsg = sqlite3.open(path)
  if not handle then
    error("cannot open " .. path .. ": " .. tostring(errmsg))
  end
  return handle
end

local db = open_db('lua_mb_urls_new.db')   -- VALID-DB
local db1 = open_db('lua_mb_urls_not.db')  -- INVALID-DB

--- Create the `url` table unless it is already there.
-- IF NOT EXISTS keeps re-runs from failing with "table url already exists".
local function ensure_schema(handle)
  local rc = handle:exec[[
    CREATE TABLE IF NOT EXISTS url (id INTEGER PRIMARY KEY, url);
  ]]
  assert(rc == sqlite3.OK, handle:errmsg())
end

--- Return the ids of every row whose url equals `url` (possibly empty).
-- The URL is bound as a parameter: it originates from a remote server's
-- redirect header and must never be spliced into SQL text.
local function ids_for_url(handle, url)
  local stmt = assert(handle:prepare("SELECT id FROM url WHERE url = ?"), handle:errmsg())
  stmt:bind_values(url)
  local ids = {}
  for id in stmt:urows() do
    ids[#ids + 1] = id
  end
  stmt:finalize()
  return ids
end

--- True when at least one row stores `url`.
local function url_known(handle, url)
  return ids_for_url(handle, url)[1] ~= nil
end

--- Highest id currently stored in the table, or 0 when the table is empty.
local function last_id(handle)
  local last = 0
  for id in handle:urows(LAST_ID_SQL) do
    last = id
  end
  return last
end

--- Insert one (id, url) row; a failed insert is reported on stderr.
local function insert_url(handle, id, url)
  local stmt = assert(handle:prepare[[ INSERT INTO url VALUES (:id, :url) ]], handle:errmsg())
  stmt:bind_names{ id = id, url = url }
  local rc = stmt:step()
  local errmsg = handle:errmsg()
  stmt:finalize()
  if rc ~= sqlite3.DONE then
    io.stderr:write("insert failed for ", tostring(url), ": ", tostring(errmsg), "\n")
  end
end

--- Overwrite the url of row `id` with the string 'null' (the row is kept).
local function null_out(handle, id)
  local stmt = assert(handle:prepare("UPDATE url SET url='null' WHERE id = ?"), handle:errmsg())
  stmt:bind_values(id)
  local rc = stmt:step()
  local errmsg = handle:errmsg()
  stmt:finalize()
  -- sqlite3.OK is 0 and therefore truthy in Lua, so the result code has to be
  -- compared explicitly for this assert to be able to fail.
  assert(rc == sqlite3.DONE, errmsg)
end

ensure_schema(db)
ensure_schema(db1)

--[[ lsqlite3 usage ]]--
--for row in db:nrows("SELECT * FROM url") do
--  print(row.id, row.url)
--end
--for row in db:rows("SELECT * FROM url") do
--  print(row[1], row[2])
--end
--for id,url in db:urows("SELECT * FROM url") do
--  print(id, url)
--end

-- Id of the row most recently written to VALID-DB (shown as "DATA size").
local id_count = 0

--- Store `url` in INVALID-DB under the next free id.
local function record_invalid(res_code, url)
  local id_count1 = last_id(db1) + 1
  print(res_code, url.." record in INVALID-DB id:"..tostring(id_count1))
  print()
  insert_url(db1, id_count1, url)
end

--- Store `url` in VALID-DB under the next free id.
local function record_valid(res_code, url)
  id_count = last_id(db) + 1
  print(res_code, url.." record in VALID-DB id:"..tostring(id_count))
  print()
  insert_url(db, id_count, url)
end

--- Run one crawl pass on an easy handle already pointed at SEED_URL.
-- @param easy cURL easy handle configured not to follow redirects
local function crawl_pass(easy)
  --[[ http request ]]--
  easy:perform()
  -- The response code of this first request is always 301, not 200, so only
  -- the redirect target is of interest.
  local res_location = easy:getinfo_redirect_url()
  print()
  print("redirect URL:", res_location)

  --[[ example: 'https://mangabank.org/2457593/' --]]
  local address_number = res_location and tonumber(res_location:match("%d+"))
  if not address_number then
    -- No redirect (or one without a page number): nothing to probe.
    print("no numeric redirect target / skip")
    return
  end

  ---- VALID DB
  if url_known(db, res_location) then
    print(res_location.." is aleady exist in VALID-DB")
    return
  end

  ---- INVALID DB
  -- `remove` remembers that the redirect target had been filed as invalid;
  -- its neighbours are then re-probed even if they are in INVALID-DB.
  local remove = false
  for _, id in ipairs(ids_for_url(db1, res_location)) do
    remove = true
    print(res_location.." is VALID.remove from INVALID-DB")
    null_out(db1, id)
  end

  id_count = last_id(db) + 1
  print(res_location.." record in VALID-DB id:"..tostring(id_count))
  insert_url(db, id_count, res_location)

  -----------------------------------------------------------------
  -- Phase 1: probe the 20 page numbers around the redirect target.
  address_number = address_number - 11
  local record = false
  for _ = 1, 20 do
    print("VALID DB DATA size :"..id_count)
    address_number = address_number + 1
    local next_url = PAGE_PREFIX..tostring(address_number).."/"
    print()
    print("test:", next_url)

    if url_known(db, next_url) then
      print(next_url.." is aleady exist in VALID-DB")
    elseif not remove and url_known(db1, next_url) then
      print(next_url.." is aleady exist in INVALID-DB")
    else
      local res_code
      if next_url == res_location then
        res_code = 200 -- the redirect target itself: skip the http request
      else
        easy:setopt(curl.OPT_URL, next_url)
        easy:perform()
        res_code = easy:getinfo_response_code()
      end

      if res_code == 200 then
        record = true
        record_valid(res_code, next_url)
      else
        print(res_code, next_url.." is INVALID")
        print()
        record = false
        ---- INVALID-DB
        if remove and url_known(db1, next_url) then
          print(next_url.." is aleady exist in INVALID-DB")
        else
          record_invalid(res_code, next_url)
        end
      end
    end
  end

  -- Phase 2: keep walking upwards, 6 page numbers per batch, while the most
  -- recently probed URL was valid.
  while record do
    for _ = 1, 6 do
      address_number = address_number + 1
      local next_url = PAGE_PREFIX..tostring(address_number).."/"
      print()
      print("test:", next_url)

      if url_known(db, next_url) then
        print(next_url.." is aleady exist in VALID-DB")
      elseif url_known(db1, next_url) then
        print(next_url.." is aleady exist in INVALID-DB")
      else
        easy:setopt(curl.OPT_URL, next_url)
        easy:perform()
        local res_code = easy:getinfo_response_code()

        if res_code == 200 then
          record = true
          record_valid(res_code, next_url)
          ---- INVALID-DB
          -- Hits in INVALID-DB were already skipped above, so this lookup
          -- normally finds nothing; it is kept as a safety net.
          if remove then
            local stale = ids_for_url(db1, next_url)
            if stale[1] then
              print(next_url.." is exist in INVALID-DB but now VAILD.")
              null_out(db1, stale[1])
            end
          end
        else
          print(res_code, next_url.." is INVALID")
          print()
          record = false
          ---- INVALID-DB
          if remove and url_known(db1, next_url) then
            print(next_url.." is aleady exist in INVALID-DB")
          else
            record_invalid(res_code, next_url)
          end
        end
      end
    end
  end
end

for bigloop = 1, 1000 do
  local easy = curl.easy{
    url = SEED_URL,
    --followlocation = true, -- true: 1
    followlocation = 0, -- false: 0
    maxredirs = 2,
    [curl.OPT_VERBOSE] = 0,
  }
  -- Response bodies are collected only to keep them off stdout.
  local buffer = {}
  easy:setopt_writefunction(table.insert, buffer)

  crawl_pass(easy)

  easy:close()
end --[[big loop end]]--

db1:close()
db:close()
Lua 言語で軽いのと、わりと速いのですが数万件の https リクエストを処理するには時間がかかります。
複数のプロセスでリクエストして、あとから結果をマージすると、より速く、より多くのデータを集められます。 例えば、同じようなプログラムを他の言語(でなくてもいいですが)で書いて、同じような別のデータベースに記録するようにして、2つの結果のデータベースの中身を比べながら、どちらにも存在するデータ、どちらかにしかないデータをまた別のデータベースに記録していくと重複なくデータが抽出できます。 Ruby、Python、Lua はよく似ている言語なので、書き比べてみてはどうでしょうか。(参考: 「マンガサイトを調査 実践。( python ) - 黒猫クックブック」) ちょっと毛色の違う Go 言語で書いてみると、ずいぶんと速かったので、http リクエストの実験として色々とやってみるのもいいかもしれません。
これは、釣り( fishing )です。 さびき釣りのようなイメージです。
リクエストのリトライまでの間隔が短ければ、より速いので cURL を使い、さらに、リダイレクト先までフォローせずに、リクエストの戻ってきたヘッダーの中の location を読み取りリダイレクト先の URL を得ます。なるべくリクエストにかかる時間のオーバーヘッドを減らすということで速く釣りあげます。
2つのデータベースをマージするためのもの。Lua ( / Ruby )
--- Merge the URL tables of two crawler databases.
--
-- Every URL of db2 and then db1 that is not yet present is appended to the
-- working database `newdb`; afterwards `newdb` is copied, ordered by url,
-- into `newdb1`.  The equivalent Ruby code is kept in comments for reference.
local sqlite3 = require("lsqlite3")

--- Open a SQLite database file, raising a readable error on failure.
local function open_db(path)
  local handle, _, errmsg = sqlite3.open(path)
  if not handle then
    error("cannot open " .. path .. ": " .. tostring(errmsg))
  end
  return handle
end

--[[ alternative configuration: merge the "not" databases instead
local db1 = sqlite3.open('lua_mb_urls_not.db') -- Lua
local db2 = sqlite3.open('mb_urls_not.db') -- other
local newdb = sqlite3.open('mb_url_notadditive.db') -- for working space
local newdb1 = sqlite3.open('lua_mb_url_not1.db') -- additive & order by url
]]
----> Ruby >
--db1 = SQLite3::Database.open "lua_mb_urls_not.db" # Lua
--db2 = SQLite3::Database.open "mb_urls_not.db" # made from python code
--newdb = SQLite3::Database.open "mb_url_notadditive.db" # additive work space
--newdb1 = SQLite3::Database.open "mb_url_not1.db" # additive & order by url
----< Ruby <

local db1 = open_db('lua_mb_urls_new.db') -- Lua
local db2 = open_db('mb_urls_new.db') -- other
local newdb = open_db('mb_url_additive.db') -- for working space
local newdb1 = open_db('lua_mb_url_new1.db')

--- Create the `url` table unless it is already there.
-- The keyword is EXISTS; "IF NOT EXIST" is a syntax error, which left the
-- table uncreated.
local function ensure_schema(handle)
  local rc = handle:exec[[
    CREATE TABLE IF NOT EXISTS url (id INTEGER PRIMARY KEY, url text);
  ]]
  assert(rc == sqlite3.OK, handle:errmsg())
end

ensure_schema(newdb)
ensure_schema(newdb1)
----> Ruby >
--require 'sqlite3'
--db1 = SQLite3::Database.open "lua_mb_urls_new.db" # Lua
--db2 = SQLite3::Database.open "mb_urls_new.db" # made from python code
--newdb = SQLite3::Database.open "mb_url_additive.db" # additive work space
--newdb1 = SQLite3::Database.open "mb_url_new1.db" # additive & order by url
--
--SQL =<<EOS
--create table IF NOT EXISTS url(
--  id INTEGER PRIMARY KEY,
--  url text
--  );
--EOS
--
--newdb.execute(SQL)
--newdb1.execute(SQL)
--
----< Ruby <

local smt1 = "SELECT id FROM url ORDER BY id DESC LIMIT 1 ;"

--- Highest id stored in the `url` table, or 0 when the table is empty.
local function last_id(handle)
  local last = 0
  for id in handle:urows(smt1) do
    last = id
  end
  return last
end

--- Append every URL produced by `select_sql` on `src` that `dst` lacks.
-- @param src        source database handle
-- @param select_sql query on `src` yielding one url per row
-- @param dst        destination database handle
-- @param next_id    highest id already used in `dst`
-- @param from_label name of `src` for the progress output
-- @param to_label   name of `dst` for the progress output
-- @param total      number printed as "total" in the progress output
-- @return the highest id used in `dst` after copying
local function copy_missing_urls(src, select_sql, dst, next_id, from_label, to_label, total)
  -- Prepared once and re-bound per row; binding also keeps quote characters
  -- inside a URL from breaking (or altering) the SQL.
  local lookup = assert(dst:prepare("SELECT id FROM url WHERE url = ? ;"), dst:errmsg())
  local insert = assert(dst:prepare[[ INSERT INTO url VALUES (:id, :url) ]], dst:errmsg())
  local i = 0
  for url_data in src:urows(select_sql) do
    i = i + 1
    print("copy from "..from_label.." ")
    print(i)
    print("total "..total)

    lookup:reset()
    lookup:bind_values(url_data)
    local exists = false
    for _ in lookup:urows() do
      exists = true
      print(url_data.." is aleady exist in DB / skip")
    end

    if not exists then -- record
      next_id = next_id + 1
      print()
      print(to_label.." : "..next_id.." "..url_data)
      insert:reset()
      insert:bind_names{ id = next_id, url = url_data }
      if insert:step() ~= sqlite3.DONE then
        io.stderr:write("insert failed for ", tostring(url_data), ": ",
                        tostring(dst:errmsg()), "\n")
      end
    end
  end
  lookup:finalize()
  insert:finalize()
  return next_id
end

local last1 = last_id(db1)
local last2 = last_id(db2)
print("db1 has "..last1.." URLs")
print("db2 has "..last2.." URLs")
----> Ruby >
--last1 = db1.execute("select id from url order by id desc limit 1")
--last2 = db2.execute("select id from url order by id desc limit 1")
--
--k = last1[0].pop # offset
--l = last2[0].pop
--
--puts "db1 has #{k} URLs"
--puts "db2 has #{l} URLs"
--
--i = 0
--
--newdb_last = newdb.execute("select id from url order by id desc limit 1")
--if !newdb_last.empty? then
--  newcount = newdb_last[0].pop # offset
--else
--  newcount = 0
--end
----< Ruby <

local newcount = last_id(newdb)

--[[ alternative source order: "SELECT url FROM url ORDER BY url ;" ]]
newcount = copy_missing_urls(db2, "SELECT url FROM url ;", newdb, newcount, "db2", "newdb", last2)
db2:close()

--[[ alternative source order: "SELECT url FROM url ORDER BY url ;" ]]
newcount = copy_missing_urls(db1, "SELECT url FROM url ;", newdb, newcount, "db1", "newdb", last1)
db1:close()
----> Ruby >
--l.times do |index|
--  i += 1
--  res = db2.execute("select url from url where id='#{i}' ;")
--  if res.empty? then
--    puts "no data / skip"
--    next
--  end
--  x = res[0][0] # url
--  puts "copy from db2"
--  p i,l
--
--  res2 = newdb.execute("select url from url where url='#{x}' ;")
--  if !res2.empty? then
--    p x
--    puts 'already exist in db / skip'
--    next
--  end
--  puts
--  newcount += 1
--  url = x
--  #url.gsub!(/\'/,"\'\'")
--  puts "#{newcount} #{url}"
--  newdb.execute("insert into url (id, url) values( '#{newcount}','#{url}') ;")
--end
--
--db2.close
--
--i = 0
--k.times do |index|
--  i += 1
--  res = db1.execute("select url from url where id='#{i}' ;")
--  if res.empty? then
--    puts "no data / skip"
--    next
--  end
--  x = res[0][0] # url
--  f = newdb.execute("select url from url where url='#{x}' ;")
--  puts "copy from db1"
--  p i,k
--
--  if !f.empty? then
--    p res[0]
--    puts 'already exist in db / skip'
--    next
--  else
--    newcount += 1
--    url = x
--    #url.gsub!(/\'/,"\'\'")
--    puts "#{newcount} #{url}"
--    newdb.execute("insert into url (id, url ) values( '#{newcount}','#{url}') ;")
--  end
--end
--
--db1.close
----< Ruby <

print("new DB last id : "..newcount)
----> Ruby >
--puts "new DB last id:#{newcount}"
----< Ruby <

local counter = last_id(newdb1) -- first time ==> 0
----> Ruby >
--newdb1_last = newdb1.execute("select id from url order by id desc limit 1")
--if !newdb1_last.empty? then
--  counter = newdb1_last[0].pop # offset
--else
--  counter = 0
--end
----< Ruby <

----> Ruby >
--i = 0
----< Ruby <

counter = copy_missing_urls(newdb, "SELECT url FROM url ORDER BY url ;", newdb1, counter,
                            "newdb", "newdb1", newcount)

newdb1:close()
newdb:close()
-- end --
----> Ruby >
--newdb.execute("select url from url order by url ;") do |row|
--  i += 1
--  url = row[0]
--  #url.gsub!(/\'/,"\'\'")
--  f = newdb1.execute("select url from url where url='#{url}' ;")
--  puts "copy from newdb"
--  p i,newcount
--
--  if !f.empty? then
--    p url
--    puts 'already exist in db / skip'
--    next
--  else
--    counter = counter + 1
--    puts "#{counter} #{url}"
--    newdb1.execute("insert into url (id, url ) values( '#{counter}','#{url}') ;")
--  end
--end
--
--newdb1.close
--newdb.close
----< Ruby <