漫画BANK マンガサイトを調査 実践。( python )
Lisp 風だと
hy 1.0a3 using CPython(default) 3.9.7 on Linux
=> (import requests)
=> (setv res (requests.get "https://mangabank.org/watch/?tour=/vol/" :allow_redirects False))
=> (type res)
<class 'requests.models.Response'>
=> (get res.headers "location")
pip install pycurl
"""Crawl mangabank.org page numbers and record which pages respond with HTTP 200.

Two SQLite files are maintained, each holding one ``url(id, url)`` table:

* ``mb_urls_new.db`` -- pages that answered 200.
* ``mb_urls_not.db`` -- pages that answered anything else ("NOT-VALID").

NOTE(review): the published source had lost its line breaks and indentation;
the block nesting below was reconstructed from the statement order and should
be confirmed against the original script.
"""
import re
import sqlite3
from urllib.parse import urlparse

BASE_URL = 'https://mangabank.org/'
# Requesting this URL answers with a redirect to a numbered page.
ENTRY_URL = 'https://mangabank.org/watch/?tour=/vol/'

NOT_VALID_DB = 'mb_urls_not.db'
NEW_DB = 'mb_urls_new.db'

# "if not exists" lets the script run repeatedly; a bare "create table"
# raises sqlite3.OperationalError on every run after the first.
SQL = """
create table if not exists url(
    id int primary key,
    url text
);
"""
SELECT_ID_BY_URL = "select id from url where url=? ;"
SELECT_LAST_ID = "select id from url order by id desc limit 1;"
INSERT_URL = "insert into url (id,url) values(?,?) ;"
UPDATE_URL_BY_ID = "update url set url=? where id=? ;"

_SLASH = re.compile(r'\/')


def _page_url(number):
    """Return the page URL for page number *number*."""
    return BASE_URL + str(number) + '/'


def _page_number(url):
    """Return the path of *url* with every slash removed, as a string."""
    return _SLASH.sub('', urlparse(url).path)


def _init_db(path):
    """Create the ``url`` table in the database at *path* if it is missing."""
    db = sqlite3.connect(path)
    try:
        db.execute(SQL)
    finally:
        db.close()


def _last_id(cursor):
    """Return the highest id stored in ``url``, or 0 for an empty table."""
    row = cursor.execute(SELECT_LAST_ID).fetchone()
    return int(row[0]) if row is not None else 0


def dbr(c, address, number, cur1, count1, cur, count, second):
    """Probe up to 11 consecutive pages and record each result.

    On the first call for a redirect target (``second`` false) the scan
    starts 10 pages below *number* and walks up to it.  On follow-up calls
    (``second`` true) the scan starts at *number* itself and stops at the
    first page that does not answer 200.

    Args:
        c: curl handle offering ``setopt``/``perform``/``getinfo`` and the
            ``URL``, ``FOLLOWLOCATION`` and ``HTTP_CODE`` option constants.
        address: URL of the page *number* refers to.
        number: page number (int, or a string of digits).
        cur1: cursor on the NOT-VALID database.
        count1: last id used in the NOT-VALID database.
        cur: cursor on the database of valid pages.
        count: last id used in the database of valid pages.
        second: False for the initial scan, True for a follow-up scan.

    Returns:
        ``(count1, count, address, again)``: the updated id counters, the
        last address examined, and True when the final page examined was
        newly inserted as valid (the caller should keep scanning upward).
    """
    gate = False
    for x in range(11):
        print(x, end=' ')
        again = False

        if x == 0:
            # The starting address may have been recorded as NOT-VALID earlier.
            stale = cur1.execute(SELECT_ID_BY_URL, [address]).fetchone()
            if stale is not None:
                print("flag 1: id=", *stale, address)
                cur1.execute(UPDATE_URL_BY_ID, ["null", *stale])
                # Commit now: previously this update was only persisted if a
                # later insert happened to commit on the same connection.
                cur1.connection.commit()
                print(address, "is VALID / remove from NOT-VALID")
                gate = True
            if not second:
                number = int(number) - 10
                address = _page_url(number)
        else:
            number = int(number) + 1
            address = _page_url(number)

        flagcheck = cur1.execute(SELECT_ID_BY_URL, [address]).fetchone()
        if flagcheck is not None:
            print()
            print("flag 2: id= ", *flagcheck, address)
            print(number, "NOT-VALID")
            if not gate:
                # Already known to be invalid: skip the request.
                continue

        c.setopt(c.URL, address)
        c.setopt(c.FOLLOWLOCATION, 0)  # True: 1 / False : 0
        c.perform()
        status = c.getinfo(c.HTTP_CODE)
        print(address, "responce code: ", status)

        if status != 200:
            count1 += 1
            print("insert NOT-VALID-DB id: ", count1, " -> ", address)
            cur1.execute(INSERT_URL, [count1, address])
            cur1.connection.commit()
            if second:
                break
            continue

        if flagcheck is not None:
            print()
            print("flag 2-1: id=", *flagcheck, address)
            cur1.execute(UPDATE_URL_BY_ID, ["null", *flagcheck])
            cur1.connection.commit()
            print(address, "is VALID / remove from NOT-VALID")

        known = cur.execute(SELECT_ID_BY_URL, [address]).fetchone()
        if known is not None:
            print()
            print("flag 0: id= ", *known, address)
            print("already exsist in DB")
            print()
            break

        if x == 0:
            # The original script requests the first page a second time
            # before storing it; kept as-is.
            c.setopt(c.URL, address)
            c.setopt(c.FOLLOWLOCATION, 0)  # True: 1 / False : 0
            c.perform()
            status = c.getinfo(c.HTTP_CODE)
            print(address, "responce code: ", status)

        if status == 200:
            count += 1
            print("insert id: ", count, " -> ", address)
            cur.execute(INSERT_URL, [count, address])
            cur.connection.commit()
            again = True
        else:
            count1 += 1
            print("insert NOT-VALID-DB id: ", count1, " -> ", address)
            cur1.execute(INSERT_URL, [count1, address])
            cur1.connection.commit()

    return count1, count, address, again


def main():
    """Repeatedly follow the entry redirect and scan around the page it names."""
    # Third-party dependency; imported here so the helpers above stay
    # importable on machines without pycurl.
    import pycurl

    _init_db(NOT_VALID_DB)
    _init_db(NEW_DB)

    c = pycurl.Curl()
    con1 = sqlite3.connect(NOT_VALID_DB)
    con = sqlite3.connect(NEW_DB)
    try:
        cur1 = con1.cursor()
        cur = con.cursor()
        count1 = _last_id(cur1)
        count = _last_id(cur)

        # Discard response bodies; only status codes and redirects are used.
        c.setopt(c.WRITEFUNCTION, lambda chunk: len(chunk))

        for _ in range(10000):
            c.setopt(c.FOLLOWLOCATION, 0)  # True: 1 / False : 0
            c.setopt(c.URL, ENTRY_URL)
            c.perform()
            address = c.getinfo(c.REDIRECT_URL)
            if address is None:
                # No redirect was issued, so there is no page number to scan.
                print("no redirect from", ENTRY_URL)
                continue
            number = _page_number(address)
            print()
            print(" " + number)

            count1, count, address, again = dbr(
                c, address, number, cur1, count1, cur, count, False)
            print()

            # Keep walking upward while new valid pages are being found;
            # give up after 5 consecutive pages that are not valid.
            limit = 5
            while again:
                print(" ", str(limit))
                number = int(_page_number(address)) + 1
                address = _page_url(number)
                c.setopt(c.URL, address)
                c.setopt(c.FOLLOWLOCATION, 0)  # True: 1 / False : 0
                c.perform()
                status = c.getinfo(c.HTTP_CODE)
                print(address, "responce code: ", status)
                if status == 200:
                    count1, count, address, again = dbr(
                        c, address, number, cur1, count1, cur, count, True)
                    limit = 5
                    continue

                limit -= 1
                print(" retry ", str(limit))
                known = cur1.execute(SELECT_ID_BY_URL, [address]).fetchone()
                if known is not None:
                    print("flag 3: id=", *known, address)
                    print(number, "NOT-VALID")
                else:
                    count1 += 1
                    print("insert NOT-VALID-DB id: ", count1, " -> ", address)
                    cur1.execute(INSERT_URL, [count1, address])
                    con1.commit()
                if limit < 1:
                    again = False
    finally:
        # Release the handle and both connections even if a request fails.
        c.close()
        con1.close()
        con.close()


if __name__ == "__main__":
    main()