Some websites decrypt their data with JavaScript and build the page on the fly, so fetching the data directly with the Requests library is awkward. Python, simple and blunt as ever, gives us the selenium library, which drives a real browser: whatever you can see, you can get. The example below shows this in practice.
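Before the full script, here is a minimal sketch of the idea, assuming Selenium 4 with an Edge driver available on the PATH; the URL and the .market-price selector are placeholders, not taken from the script below:

from selenium import webdriver
from selenium.webdriver.common.by import By

browser = webdriver.Edge()                        # drive a real Edge browser
browser.implicitly_wait(10)                       # retry element lookups while the page's JS builds the DOM
browser.get("https://www.example.com/item/1")     # a page whose content is generated by JavaScript
price = browser.find_element(By.CSS_SELECTOR, ".market-price").text  # read the rendered text
html = browser.page_source                        # or hand the fully rendered HTML to BeautifulSoup
print(price)
browser.quit()

The full crawler script follows.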
#
from bs4 import BeautifulSoup
from PIL import Image
import requests,pymysql,time,re,random,configparser,os,datetime,sys
from selenium import webdriver
def trim(s):
    r = re.findall(r'[\S]+', s)
return " ".join(r)
class mytool:
ip="127.0.0.1"
user="admin"
passwd="admin"
database="yyy"
idlist=[15,975,978,991,993,994,995,996,1002,1003,1004,1005,1006,1007,1008,1009,1010,1011,1012,1013,1014,1015,1016,1017,1018,1019,1020]
alltypelist=[]
curIndex=0
browser = None
conn=None
def __init__(self):
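        # Connect to MySQL, cache the level-3 type list, and start an Edge browser through Selenium.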
        self.conn = pymysql.connect(host=self.ip, user=self.user, password=self.passwd)  # keyword arguments; newer pymysql no longer accepts positional host/user/passwd
self.alltypelist=self.getallservertype3()
self.browser=webdriver.Edge()
def __del__(self):
self.conn.close()
def getallservertype3(self):
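        # Return every yjcode_servertype row whose admin field is 3.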
rt=[]
self.conn.select_db(self.database)
        cur=self.conn.cursor()  # get a cursor
cur.execute("select * from yjcode_servertype WHERE admin=3")
while 1:
res=cur.fetchone()
if res is None:
                # the result set has been fully read
break
rt.append(res)
cur.close()
self.conn.commit()
return rt
def getrandomuserid(self):
i=random.randint(0,len(self.idlist)-1)
return self.idlist[i]
def getservertype12id(self,type1name,type2name):
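        # Look up the level-1 id (name2 and name3 empty) and the level-2 id (name3 empty) for the given category names; returns (rt1, rt2).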
rt1=0
rt2=0
self.conn.select_db(self.database)
        cur=self.conn.cursor()  # get a cursor
cur.execute("select id from yjcode_servertype WHERE name1='"+type1name+"' AND name2='"+type2name +"' AND name3='' ; " )
while 1:
res=cur.fetchone()
if res is None:
                # the result set has been fully read
break
rt2=res[0]
cur.close()
self.conn.commit()
        cur=self.conn.cursor()  # get a cursor
cur.execute("select id from yjcode_servertype WHERE name1='"+type1name+"' AND name2='' AND name3='' ; " )
while 1:
res=cur.fetchone()
if res is None:
                # the result set has been fully read
break
rt1=res[0]
cur.close()
self.conn.commit()
return (rt1,rt2)
def checkisexist(self,tit):
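        # Return the id of an existing yjcode_server row with this title, or 0 if there is none.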
rt=0
self.conn.select_db(self.database)
        cur=self.conn.cursor()  # get a cursor
cur.execute("select id from yjcode_server WHERE tit='"+tit+"';")
while 1:
res=cur.fetchone()
if res is None:
                # the result set has been fully read
break
rt=res[0]
cur.close()
self.conn.commit()
return rt
def insertServerdata(self,userid,bh,ty1id,ty2id,ty3id,tit,txt,money):
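        # Insert one scraped item into yjcode_server; skip the insert when a field is empty or the title already exists.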
if userid<1 or len(txt)<1 or len(tit)<1 or len(money)<1:
print("parament is null")
return False
if self.checkisexist(tit)>0:
print("is exist")
return False
dtime=str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
con = pymysql.connect(host=self.ip,database=self.database, charset='utf8',user=self.user, password=self.passwd)
with con.cursor() as cursor:
result = cursor.execute(
'INSERT INTO yjcode_server (userid,bh,ty1id,ty2id,ty3id,zt,sj,lastsj,tit,txt,xsnum,money1,ifxj) \
VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s )' ,
(userid,bh,ty1id,ty2id,ty3id,'0',dtime,dtime,tit,txt,'0',money,'0')
)
if result == 1:
            print(tit+' added successfully!')
else:
print("fail")
con.commit()
con.close()
return True
def getsrcname(self,src):
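        # Pull the trailing "/name.ext" part (gif/png/jpg/bmp) out of an image URL.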
new_imgscrobj=re.finditer(r"/[A-Za-z0-9]+\.(gif|png|jpg|bmp)$",src)
new_imgscr=""
for n in new_imgscrobj:
new_imgscr=str(n.group())
return new_imgscr
def getburimg(self,filepath) :
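        # Parse a saved HTML file, read the URL text of <a id="huzhanurl"> and pass it to the external webimg.exe.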
        soup = BeautifulSoup(open(filepath, 'r', encoding='UTF-8'), 'html.parser')
#print (soup.prettify())
huzhanurl=soup.find('a',id='huzhanurl').get_text()
os.system("start D:\\Users\\aihao\\webimg.exe "+huzhanurl)
def downloadimg(self,imgsrc,savepath):
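        # Download an image with requests and write it to savepath.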
#print(imgsrc,savepath)
r=requests.get(url=imgsrc)
with open(savepath, 'wb') as file:
file.write(r.content)
        print('download img complete')
time.sleep(5)
#file.close()
def createthumbnail(self,imgsrc):
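        # Keep a "-1" copy of the image and save a 30% Pillow thumbnail with a "-2" suffix.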
newimg1=imgsrc.replace(".","-1.")
print(sys.platform)
if sys.platform.find("win")>=0:
sysstr="copy "+imgsrc+" "+newimg1
os.system(sysstr)
elif sys.platform.find("linux")>=0:
sysstr="cp "+imgsrc+" "+newimg1
os.system(sysstr)
im=Image.open(imgsrc)
imgsize=im.size
imgth=0.3
im.thumbnail((imgsize[0]*imgth,imgsize[1]*imgth))
#print(im.format,im.size,im.mode)
newimg2=imgsrc.replace(".","-2.")
im.save(newimg2,'JPEG')
def downloadpage(self,index):
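        # Open a listing page in Edge through the Windows "start" command.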
#https://www.xxx.com/code/page/21
print(index)
os.system("start msedge https://www.xxx.com/code/page/"+str(index))
def office368(self,pagestr,userid):
        # office procurement category
#https://www.xxx.com/category.php?id=67
thisurl="https://www.xxx.com/category.php?id="+str(pagestr)
r=requests.get(url=thisurl)
r.encoding='utf-8'
#print(r.text)
list_id=[]
list_img=[]
m_index=0
soup = BeautifulSoup(r.text,'html.parser')
ul=soup.find('ul',class_='list-grid clearfix')
if not ul:
print("ul error ")
return
lis=ul.find_all('li',class_='item')
for li in lis:
href=str(li.find('a').get('href'))
list_id.append(href)
imgsrc=str(li.find('img').get('data-original'))
list_img.append(imgsrc)
#print(href,imgsrc)
            #break
for viewid in list_id:
m_index+=1
goodsidurl="https://www.xxx.com/"+viewid
thisurl=goodsidurl
r=requests.get(goodsidurl)
r.encoding='utf-8'
time.sleep(3)
print( time.asctime( time.localtime(time.time()) ))
soup = BeautifulSoup(r.text,'html.parser')
tit=soup.find('div',class_='goods-name').get_text()
#print(tit)
#print("..")
money=str(soup.find('font',class_='market-price').get_text()).replace("¥","").replace(",","")
#print(money)
#print(tit.get_text())
#txt_o=soup.find('div',id='bqdiv1')
#imgs=txt_o.find_all('img')
#for img in imgs:
# imgscr=str(img.get('src'))
# if len(imgscr)>1 and -1==imgscr.find('http') :
# img['src']="https://www.xxx.com"+imgscr
txt=str(soup.find('div',class_='right-con')).replace('阳光易购',"")
#time.sleep(10000000)
if(len(money)>1 and len(tit)>1 and len(txt)>1 ):
i=list_id.index(viewid)
t_imgsrc=list_img[i]
k=t_imgsrc.rfind('/')+1
#https://img.xxx.com/imgextra/i4/682114580/TB1IIsEXyqAXuNjy1XdXXaYcVXa_!!0-item_pic.jpg_300x300.jpg
bhname=t_imgsrc[k:].replace('-','').replace('_!!',"").replace(".jpg_","").replace(".png_","")
bh=str(int(time.time()))+"-"+str(userid);
xsnum=0
if(1==random.randint(0,50)):
xsnum=random.randint(1,10)
url = "http://xxxx/zhf/savedata.php"
data = {"thisurl":thisurl,"xsnum":xsnum,"userid":userid,"bh":bh,'tit':tit,'money':money,"txt":txt,'bauto':0,'burl':"","bh_name":bhname,"ty1id":39,"ty2id":0,"ty3id":0}
res = requests.post(url=url,data=data)
print(res.text)
if(res.text.find("success")>1):
f_path='d:\\upload\\bh\\'+bhname
inc=0
while((not os.path.isfile(f_path)) and inc<3 ):
inc+=1
if not re.match(r"^http",t_imgsrc):
t_imgsrc="https://www.xxx.com"+t_imgsrc
print("download img->"+t_imgsrc+" -> "+f_path)
self.downloadimg(t_imgsrc,f_path)
time.sleep(5)
if os.path.isfile(f_path):
self.uploadfile(bhname,"d:\\upload\\bh\\"+bhname,userid,bh)
time.sleep(5)
# os.rename(filepath,filepath.replace(".html",".xxxx"))
elif (res.text.find("isexist!")>=0):
print(thisurl)
if m_index>3:
print("next page")
time.sleep(5)
break
def uploadfile(self,filename,filePath,userid,bh) :
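        # POST the downloaded image to the remote uploadfile.php endpoint together with userid and bh.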
url="http://xxx/zhf/uploadfile.php"
files = {'file': (filename, open(filePath, 'rb'),'image/jpeg')}
data={"userid":userid,"bh":bh}
r= requests.post(url, data=data, files=files)
        #print(requests.Request('POST', url, files=files).prepare().body.decode('ascii'))  # debug: inspect the prepared multipart body
print(r.text)
def ai_getty1id(self,tit):
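        # Map keywords in the title to a level-1 category id.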
if re.search(r'(网站模板)|(网站源码)',tit):
return 37
elif re.search(r'(品牌设)',tit):
return 152
def haozhan(self,page,userid):
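        # Crawl one listing page of the "haozhan" site, then fetch each item and post it to savedata.php.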
#http://www.xxx.com/code?page=1355
url="http://www.xxx.com/code?page="+str(page)
list_id=[]
list_img=[]
r=requests.get(url)
r.encoding='utf-8'
try:
soup = BeautifulSoup(r.text,'html.parser')
div=soup.find('div',class_="list_items" )
#print (div.get_text())
dts=div.find_all('dt')
for dt in dts:
a=dt.find('a')
#print(a.get('href'))
list_id.append(str(a.get('href')))
#print(bhname)
#imga=ul.find('img')
#list_img.append(str(imga.get('src')))
#print(imga)
#print(imga.get('src'))
#print("-->"+li.get_text())
except AttributeError:
pass
for viewid in list_id:
#print(goodsidurl)
try:
goodsidurl="http://www.xxx.com"+viewid
#print(goodsidurl)
r=requests.get(goodsidurl)
r.encoding='utf-8'
time.sleep(3)
print( time.asctime( time.localtime(time.time()) ))
soup = BeautifulSoup(r.text,'html.parser')
thisurl=goodsidurl
tit=soup.find('span',class_='cate').get_text()
print("-->")
print(tit)
time.sleep(180)
break
#print("..")
money=str(soup.find('span',id='nowmoney').get_text())
#print(money)
#print(tit.get_text())
txt_o=soup.find('div',id='bqdiv1')
imgs=txt_o.find_all('img')
for img in imgs:
imgscr=str(img.get('src'))
if len(imgscr)>1 and -1==imgscr.find('http') :
img['src']="https://www.xxx.com"+imgscr
txt=str(soup.find('div',id='bqdiv1'))
#time.sleep(10000000)
if(len(money)>1 and len(tit)>1 and len(txt)>1 ):
i=list_id.index(viewid)
t_imgsrc=list_img[i]
k=t_imgsrc.rfind('/')+1
bhname=t_imgsrc[k:].replace('-','')
bh=str(int(time.time()))+"-"+str(userid);
xsnum=0
if(1==random.randint(0,50)):
xsnum=random.randint(1,10)
url = "http://xxx/zhf/savedata.php"
data = {"thisurl":thisurl,"xsnum":xsnum,"userid":userid,"bh":bh,'tit':tit,'money':money,"txt":txt,'bauto':0,'burl':"","bh_name":bhname}
if len(txt)>1:
res = requests.post(url=url,data=data)
print(res.text)
if(res.text.find("success")>1):
f_path='d:\\upload\\bh\\'+bhname
if not os.path.isfile(f_path):
print("download img->"+t_imgsrc+" -> "+f_path)
self.downloadimg(t_imgsrc,f_path)
time.sleep(5)
self.uploadfile(bhname,"d:\\upload\\bh\\"+bhname,userid,bh)
time.sleep(5)
# os.rename(filepath,filepath.replace(".html",".xxxx"))
elif (res.text.find("isexist!")>=0):
print(thisurl)
else:
print("txt is null")
except AttributeError:
pass
except FileNotFoundError:
pass
except BaseException:
pass
def zhisu(self,page,userid):
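        # Crawl one listing page of the "zhisu" site (gb2312 pages), then fetch each item and post it to savedata.php.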
#https://www.xxx.com/product/search_j1v_p1v.html
url="https://www.xxx.com/product/search_j1v_p"+str(page)+"v.html"
list_id=[]
list_img=[]
r=requests.get(url)
r.encoding='gb2312'
try:
soup = BeautifulSoup(r.text,'html.parser')
div=soup.find('div',class_="biglist" )
#print (div.get_text())
uls=div.find_all('ul',class_='u1')
for ul in uls:
a=ul.find('a')
#print(a.get('href'))
list_id.append(str(a.get('href')))
#print(bhname)
imga=ul.find('img')
list_img.append(str(imga.get('src')))
#print(imga)
#print(imga.get('src'))
#print("-->"+li.get_text())
except AttributeError:
pass
for viewid in list_id:
#print(goodsidurl)
try:
goodsidurl="https://www.xxx.com"+viewid
#print(goodsidurl)
r=requests.get(goodsidurl)
r.encoding='gb2312'
time.sleep(3)
print( time.asctime( time.localtime(time.time()) ))
soup = BeautifulSoup(r.text,'html.parser')
thisurl=goodsidurl
tit_p=soup.find('div',id='jbmiddle')
tit=tit_p.find('h1').get_text()
#print(tit)
#print("..")
money=str(soup.find('span',id='nowmoney').get_text())
#print(money)
#print(tit.get_text())
txt_o=soup.find('div',id='bqdiv1')
imgs=txt_o.find_all('img')
for img in imgs:
imgscr=str(img.get('src'))
if len(imgscr)>1 and -1==imgscr.find('http') :
img['src']="https://www.xxx.com"+imgscr
txt=str(soup.find('div',id='bqdiv1'))
#time.sleep(10000000)
if(len(money)>1 and len(tit)>1 and len(txt)>1 ):
i=list_id.index(viewid)
t_imgsrc=list_img[i]
k=t_imgsrc.rfind('/')+1
bhname=t_imgsrc[k:].replace('-','')
bh=str(int(time.time()))+"-"+str(userid);
xsnum=0
if(1==random.randint(0,50)):
xsnum=random.randint(1,10)
url = "http://xxx/zhf/savedata.php"
data = {"thisurl":thisurl,"xsnum":xsnum,"userid":userid,"bh":bh,'tit':tit,'money':money,"txt":txt,'bauto':0,'burl':"","bh_name":bhname}
if len(txt)>1:
res = requests.post(url=url,data=data)
print(res.text)
if(res.text.find("success")>1):
f_path='d:\\upload\\bh\\'+bhname
inc=0
while((not os.path.isfile(f_path)) and inc<3 ):
inc+=1
print("download img->"+t_imgsrc+" -> "+f_path)
self.downloadimg(t_imgsrc,f_path)
time.sleep(5)
if os.path.isfile(f_path):
self.uploadfile(bhname,"d:\\upload\\bh\\"+bhname,userid,bh)
time.sleep(5)
# os.rename(filepath,filepath.replace(".html",".xxxx"))
elif (res.text.find("isexist!")>=0):
print(thisurl)
else:
print("txt is null")
except AttributeError:
pass
except FileNotFoundError:
pass
except BaseException:
pass
def w87zx(self,page,userid):
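        # Same flow as zhisu for the "w87zx" site: crawl the listing, then post each item and its image.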
#https://www.xxx.com/product/search_j1v_p1v.html
url="https://www.xxx.com/product/search_j1v_p"+str(page)+"v.html"
list_id=[]
list_img=[]
r=requests.get(url)
r.encoding='gb2312'
try:
soup = BeautifulSoup(r.text,'html.parser')
div=soup.find('div',class_="biglist" )
#print (div.get_text())
uls=div.find_all('ul',class_='u1')
for ul in uls:
a=ul.find('a')
#print(a.get('href'))
list_id.append(str(a.get('href')))
#print(bhname)
imga=ul.find('img')
list_img.append(str(imga.get('src')))
#print(imga)
#print(imga.get('src'))
#print("-->"+li.get_text())
except AttributeError:
print("AttributeError")
for viewid in list_id:
#print(goodsidurl)
try:
goodsidurl="https://www.xxx.com/product/"+viewid
r=requests.get(goodsidurl)
r.encoding='gb2312'
time.sleep(3)
print( time.asctime( time.localtime(time.time()) ))
soup = BeautifulSoup(r.text,'html.parser')
thisurl=goodsidurl
tit_p=soup.find('div',id='jbmiddle')
tit=tit_p.find('h1').get_text()
#print(tit)
#print("..")
money=str(soup.find('span',id='nowmoney').get_text())
#print(money)
txt_o=soup.find('div',id='bqdiv1')
imgs=txt_o.find_all('img')
for img in imgs:
imgscr=str(img.get('src'))
if len(imgscr)>1 and -1==imgscr.find('http') :
img['src']="https://www.xxx.com"+imgscr
txt=str(soup.find('div',id='bqdiv1'))
#time.sleep(10000000)
if(len(money)>1 and len(tit)>1 and len(txt)>1 ):
i=list_id.index(viewid)
t_imgsrc=list_img[i]
k=t_imgsrc.rfind('/')+1
bhname=t_imgsrc[k:].replace('-','')
bh=str(int(time.time()))+"-"+str(userid);
xsnum=0
if(1==random.randint(0,50)):
xsnum=random.randint(1,10)
url = "http://xxx/zhf/savedata.php"
data = {"thisurl":thisurl,"xsnum":xsnum,"userid":userid,"bh":bh,'tit':tit,'money':money,"txt":txt,'bauto':0,'burl':"","bh_name":bhname}
if len(txt)>1:
res = requests.post(url=url,data=data)
print(res.text)
if(res.text.find("success")>1):
f_path='d://upload//bh//'+bhname
inc=0
while((not os.path.isfile(f_path)) and inc<3 ):
inc+=1
print("download img->"+t_imgsrc+" -> "+f_path)
self.downloadimg(t_imgsrc,f_path)
time.sleep(5)
if os.path.isfile(f_path):
self.uploadfile(bhname,"d:\\upload\\bh\\"+bhname,userid,bh)
time.sleep(5)
# os.rename(filepath,filepath.replace(".html",".xxxx"))
elif (res.text.find("isexist!")>=0):
print(thisurl)
else:
print("txt is null")
except AttributeError:
print("AttributeError "+viewid)
except FileNotFoundError:
print("FileNotFoundError "+viewid)
except BaseException:
print("BaseException "+viewid)
def suibianlu_s(self,page,userid):
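        # Crawl one code_<page> listing and collect the item links and thumbnail image URLs.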
url="https://www.xxx.com/code_"+str(page)
list_id=[]
list_img=[]
r=requests.get(url)
r.encoding='utf-8'
try:
soup = BeautifulSoup(r.text,'html.parser')
lis=soup.find_all('li',class_="clearfix" )
for li in lis:
#time.sleep(1)
#print(li.find('a').get('href'))
goodsid=li.find('a').get('href')
if len(goodsid):
list_id.append(goodsid)
s=goodsid.split('/')
bhn=(s[len(s)-1]).replace(".html","")
imgsrc=li.find('img').get('src')
if len(imgsrc):
list_img.append(imgsrc)
#print(bhname)
#print("-->"+li.get_text())
except AttributeError:
pass
for goodsidurl in list_id:
#print(goodsidurl)