漫画BANK マンガサイトを調査 実践。( python )

kuroca.hatenablog.com

kuroca.hatenablog.com

kuroca.hatenablog.com

Lisp 風だと

hy 1.0a3 using CPython(default) 3.9.7 on Linux
=> (import requests)
=> (setv res (requests.get "https://mangabank.org/watch/?tour=/vol/" :allow_redirects False))
=> (type res)                                                                                      
<class 'requests.models.Response'> 
=> (get res.headers "location")
pip install pycurl
import pycurl

# Single curl handle; every request made further down in this script reuses it.
c = pycurl.Curl()

# Earlier experiment, kept for reference: follow the redirect and print the
# final (effective) URL instead of reading the redirect target.
#c.setopt(c.URL, 'https://mangabank.org/watch/?tour=/vol/')
## Follow redirect.
#c.setopt(c.FOLLOWLOCATION, 1) # True: 1 / False : 0
#for x in range(1):
#    c.perform()
#    #print(c.getinfo(c.REDIRECT_URL))
#    print(c.getinfo(c.EFFECTIVE_URL))
#
#c.close()

#import requests
import sqlite3


# Schema shared by both databases: one row per URL, keyed by a sequential id.
# "if not exists" makes the setup below safe on every run; a plain
# "create table" raises sqlite3.OperationalError from the second run onwards.
SQL = """
create table if not exists url(
    id int primary key,
    url text
    );
"""

# Make sure both database files exist and contain the url table.
# mb_urls_not.db receives URLs that did not answer 200 (NOT-VALID),
# mb_urls_new.db receives URLs that did.
for db_path in ('mb_urls_not.db', 'mb_urls_new.db'):
    db = sqlite3.connect(db_path)
    try:
        db.execute(SQL)
    finally:
        # Release the file handle even if the DDL statement fails.
        db.close()


# Long-lived connections used by the crawl below:
#   con1 / cur1 -> mb_urls_not.db (URLs recorded as NOT-VALID)
#   con  / cur  -> mb_urls_new.db (URLs that answered 200)
con1 = sqlite3.connect('mb_urls_not.db')
con = sqlite3.connect('mb_urls_new.db')
cur1 = con1.cursor()
cur = con.cursor()

# Resume the id counters from the highest id already stored in each
# database; an empty table starts the counter at 0.
last_id1 = cur1.execute("select id from url order by id desc limit 1;")
index_num1 = last_id1.fetchone()
count1 = 0 if index_num1 is None else int(index_num1[0])

last_id = cur.execute("select id from url order by id desc limit 1;")
index_num = last_id.fetchone()
count = 0 if index_num is None else int(index_num[0])

from urllib.parse import urlparse
import re

# Response bodies are not needed (only status codes and redirect targets are
# read), so the write callback simply reports every chunk as consumed.
c.setopt(c.WRITEFUNCTION, lambda chunk: len(chunk))


def dbr(c,address,number,cur1,count1,cur,count,second):
    """Probe a run of consecutive page numbers and record each result.

    The loop runs 11 steps. Step 0 deals with ``address`` itself; steps
    1-10 each advance ``number`` by one and probe the resulting URL.
    URLs answering 200 are inserted into the "new" database, all others
    into the NOT-VALID database. Writes are committed through the
    module-level connections ``con1`` (NOT-VALID) and ``con`` (new).

    Args:
        c: pycurl handle used for the HTTP requests.
        address: URL to start from.
        number: page number belonging to ``address`` (str or int).
        cur1: cursor on the NOT-VALID database.
        count1: last id used in the NOT-VALID database.
        cur: cursor on the "new" database.
        count: last id used in the "new" database.
        second: False for the first call on a redirect target; the scan
            then starts 10 numbers below it. True for follow-up calls,
            which start at ``address`` itself and stop at the first
            non-200 response.

    Returns:
        Tuple ``(count1, count, address, again)``: the updated counters,
        the last URL handled, and ``again`` which is True only when the
        final step stored a URL that answered 200.
    """
    # Set when the starting address was listed as NOT-VALID; the following
    # NOT-VALID entries are then re-requested instead of being skipped.
    gate = False
    for x in range(11):
        print(x,end='  ')
        again = False
        if x == 0:
            ###### <1 check invalid db
            flag1 = cur1.execute("select id from url where url=? ;",[address])

            flagcheck = flag1.fetchone()
            if flagcheck is not None:
                print("flag 1: id=",*flagcheck,address)
                cur1.execute("update url set url=?  where id=? ;",["null",*flagcheck])
                # Commit right away; otherwise the update is lost unless a
                # later insert on this connection happens to commit it.
                con1.commit()
                print(address,"is VALID / remove from NOT-VALID")
                gate = True
            if not second:
                # First pass: start 10 numbers below the given address.
                number = int(number) - 10
                address = 'https://mangabank.org/'+str(number)+'/'
            ###### 1>
        else:
            number = int(number) + 1
            address = 'https://mangabank.org/'+str(number)+'/'
            ###### <2
            flag2 = cur1.execute("select id from url where url=? ;",[address])

            flagcheck = flag2.fetchone()
            if flagcheck is not None:
                print()
                print("flag 2: id= ",*flagcheck,address)
                print(number,"NOT-VALID")
                if not gate:
                    # Known NOT-VALID and no reason to re-check: skip it.
                    continue
            ###### 2>
            c.setopt(c.URL,address)
            c.setopt(c.FOLLOWLOCATION, 0) # True: 1 / False : 0
            c.perform()
            print(address,"responce code: ", c.getinfo(c.HTTP_CODE))
            if c.getinfo(c.HTTP_CODE) != 200:
                count1 += 1
                print("insert NOT-VALID-DB id: ",count1," -> ",address)
                cur1.execute("insert into url (id,url) values(?,?) ;",[count1,address])
                con1.commit()
                if second:
                    break
                continue
            if flagcheck is not None:
                # Previously NOT-VALID but answers 200 now: blank the entry.
                print()
                print("flag 2-1: id=",*flagcheck,address)
                cur1.execute("update url set url=?  where id=? ;",["null",*flagcheck])
                con1.commit()
                print(address,"is VALID / remove from NOT-VALID")

        flag = cur.execute("select id from url where url=? ;",[address])

        flagcheck = flag.fetchone()
        if flagcheck is not None:
            print()
            print("flag 0: id= ",*flagcheck,address)
            print("already exsist in DB")
            print()
            break

        if x == 0:
            # Steps 1-10 have already requested the URL above; step 0 has not.
            c.setopt(c.URL,address)
            c.setopt(c.FOLLOWLOCATION, 0) # True: 1 / False : 0
            c.perform()
            print(address,"responce code: ", c.getinfo(c.HTTP_CODE))

        if c.getinfo(c.HTTP_CODE) == 200:
            count += 1
            print("insert id: ",count," -> ",address)
            cur.execute("insert into url (id,url) values(?,?) ;",[count,address])
            con.commit()
            again = True
            continue

        count1 += 1
        print("insert NOT-VALID-DB id: ",count1," -> ",address)
        cur1.execute("insert into url (id,url) values(?,?) ;",[count1,address])
        con1.commit()
    return count1,count,address,again


try:
    for i in range(10000):
        # Request the entry URL without following redirects and read the
        # redirect target; its path is the page number to scan around.
        # NOTE(review): REDIRECT_URL is None when the server sends no
        # redirect, which makes the lines below fail -- confirm whether that
        # case needs handling.
        c.setopt(c.FOLLOWLOCATION, 0) # True: 1 / False : 0
        c.setopt(c.URL, 'https://mangabank.org/watch/?tour=/vol/')
        c.perform()
        res = c.getinfo(c.REDIRECT_URL)
        address = res
        number = re.sub(r'\/','',urlparse(res).path)

        print()
        print("      " + number)

        second = False
        count1,count,address,again = dbr(c,address,number,cur1,count1,cur,count,second)

        print()
        # Number of consecutive non-200 answers tolerated while extending
        # the scan upwards past the last stored URL.
        limit = 5
        while again:
            print("      ", str(limit))
            number = re.sub(r'\/','',urlparse(address).path)
            number = int(number) + 1
            address = 'https://mangabank.org/'+str(number)+'/'
            c.setopt(c.URL,address)
            c.setopt(c.FOLLOWLOCATION, 0) # True: 1 / False : 0
            c.perform()
            print(address,"responce code: ", c.getinfo(c.HTTP_CODE))
            if c.getinfo(c.HTTP_CODE) == 200:
                second = True
                count1,count,address,again = dbr(c,address,number,cur1,count1,cur,count,second)
                limit = 5
            else:
                limit -= 1
                print("  retry ", str(limit))
                ###### <3
                flag1 = cur1.execute("select id from url where url=? ;",[address])

                flagcheck = flag1.fetchone()
                if flagcheck is not None:
                    # Already recorded as NOT-VALID: do not insert it again.
                    print("flag 3: id=",*flagcheck,address)
                    print( number,"NOT-VALID")

                    if limit < 1:
                        again = False

                    continue
                ###### 3>
                count1 += 1
                print("insert NOT-VALID-DB id: ",count1," -> ",address)
                cur1.execute("insert into url (id,url) values(?,?) ;",[count1,address])
                con1.commit()
                if limit < 1:
                    again = False
finally:
    # Release the curl handle and both databases even when the crawl is
    # interrupted or a request raises.
    c.close()
    con1.close()
    con.close()