Some websites decrypt their data with JavaScript and build the page on the fly, so fetching the data directly with the Requests library is awkward. Python, simple and blunt as ever, gives us the selenium library, which drives a real browser: whatever you can see, you can get. The example below shows this in practice.
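Before the full script, here is a minimal sketch of the idea, assuming Selenium 4 with an Edge driver available on the PATH; the URL and the .market-price selector are placeholders, not taken from the script below:

from selenium import webdriver
from selenium.webdriver.common.by import By

browser = webdriver.Edge()                        # drive a real Edge browser
browser.implicitly_wait(10)                       # retry element lookups while the page's JS builds the DOM
browser.get("https://www.example.com/item/1")     # a page whose content is generated by JavaScript
price = browser.find_element(By.CSS_SELECTOR, ".market-price").text  # read the rendered text
html = browser.page_source                        # or hand the fully rendered HTML to BeautifulSoup
print(price)
browser.quit()

The full crawler script follows.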
#
from bs4 import BeautifulSoup
from PIL import Image
import requests,pymysql,time,re,random,configparser,os,datetime,sys
from selenium import webdriver
def trim(s):
    r = re.findall(r'[\S]+', s)
return " ".join(r)
class mytool:
ip="127.0.0.1"
user="admin"
passwd="admin"
database="yyy"
idlist=[15,975,978,991,993,994,995,996,1002,1003,1004,1005,1006,1007,1008,1009,1010,1011,1012,1013,1014,1015,1016,1017,1018,1019,1020]
alltypelist=[]
curIndex=0
browser = None
conn=None
def __init__(self):
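        # Connect to MySQL, cache the level-3 type list, and start an Edge browser through Selenium.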
        self.conn = pymysql.connect(host=self.ip, user=self.user, password=self.passwd)  # keyword arguments; newer pymysql no longer accepts positional host/user/passwd
self.alltypelist=self.getallservertype3()
self.browser=webdriver.Edge()
def __del__(self):
self.conn.close()
def getallservertype3(self):
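        # Return every yjcode_servertype row whose admin field is 3.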
rt=[]
self.conn.select_db(self.database)
        cur=self.conn.cursor()  # get a cursor
cur.execute("select * from yjcode_servertype WHERE admin=3")
while 1:
res=cur.fetchone()
if res is None:
                # the result set has been fully read
break
rt.append(res)
cur.close()
self.conn.commit()
return rt
def getrandomuserid(self):
i=random.randint(0,len(self.idlist)-1)
return self.idlist[i]
def getservertype12id(self,type1name,type2name):
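        # Look up the level-1 id (name2 and name3 empty) and the level-2 id (name3 empty) for the given category names; returns (rt1, rt2).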
rt1=0
rt2=0
self.conn.select_db(self.database)
        cur=self.conn.cursor()  # get a cursor
cur.execute("select id from yjcode_servertype WHERE name1='"+type1name+"' AND name2='"+type2name +"' AND name3='' ; " )
while 1:
res=cur.fetchone()
if res is None:
                # the result set has been fully read
break
rt2=res[0]
cur.close()
self.conn.commit()
        cur=self.conn.cursor()  # get a cursor
cur.execute("select id from yjcode_servertype WHERE name1='"+type1name+"' AND name2='' AND name3='' ; " )
while 1:
res=cur.fetchone()
if res is None:
                # the result set has been fully read
break
rt1=res[0]
cur.close()
self.conn.commit()
return (rt1,rt2)
def checkisexist(self,tit):
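        # Return the id of an existing yjcode_server row with this title, or 0 if there is none.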
rt=0
self.conn.select_db(self.database)
        cur=self.conn.cursor()  # get a cursor
cur.execute("select id from yjcode_server WHERE tit='"+tit+"';")
while 1:
res=cur.fetchone()
if res is None:
                # the result set has been fully read
break
rt=res[0]
cur.close()
self.conn.commit()
return rt
def insertServerdata(self,userid,bh,ty1id,ty2id,ty3id,tit,txt,money):
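        # Insert one scraped item into yjcode_server; skip the insert when a field is empty or the title already exists.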
if userid<1 or len(txt)<1 or len(tit)<1 or len(money)<1:
print("parament is null")
return False
if self.checkisexist(tit)>0:
print("is exist")
return False
dtime=str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
con = pymysql.connect(host=self.ip,database=self.database, charset='utf8',user=self.user, password=self.passwd)
with con.cursor() as cursor:
result = cursor.execute(
'INSERT INTO yjcode_server (userid,bh,ty1id,ty2id,ty3id,zt,sj,lastsj,tit,txt,xsnum,money1,ifxj) \
VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s )' ,
(userid,bh,ty1id,ty2id,ty3id,'0',dtime,dtime,tit,txt,'0',money,'0')
)
if result == 1:
            print(tit+' added successfully!')
else:
print("fail")
con.commit()
con.close()
return True
def getsrcname(self,src):
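        # Pull the trailing "/name.ext" part (gif/png/jpg/bmp) out of an image URL.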
new_imgscrobj=re.finditer(r"/[A-Za-z0-9]+\.(gif|png|jpg|bmp)$",src)
new_imgscr=""
for n in new_imgscrobj:
new_imgscr=str(n.group())
return new_imgscr
def getburimg(self,filepath) :
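        # Parse a saved HTML file, read the URL text of <a id="huzhanurl"> and pass it to the external webimg.exe.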
        soup = BeautifulSoup(open(filepath, 'r', encoding='UTF-8'), 'html.parser')
#print (soup.prettify())
huzhanurl=soup.find('a',id='huzhanurl').get_text()
os.system("start D:\\Users\\aihao\\webimg.exe "+huzhanurl)
def downloadimg(self,imgsrc,savepath):
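        # Download an image with requests and write it to savepath.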
#print(imgsrc,savepath)
r=requests.get(url=imgsrc)
with open(savepath, 'wb') as file:
file.write(r.content)
        print('download img complete')
time.sleep(5)
#file.close()
def createthumbnail(self,imgsrc):
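        # Keep a "-1" copy of the image and save a 30% Pillow thumbnail with a "-2" suffix.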
newimg1=imgsrc.replace(".","-1.")
print(sys.platform)
if sys.platform.find("win")>=0:
sysstr="copy "+imgsrc+" "+newimg1
os.system(sysstr)
elif sys.platform.find("linux")>=0:
sysstr="cp "+imgsrc+" "+newimg1
os.system(sysstr)
im=Image.open(imgsrc)
imgsize=im.size
imgth=0.3
im.thumbnail((imgsize[0]*imgth,imgsize[1]*imgth))
#print(im.format,im.size,im.mode)
newimg2=imgsrc.replace(".","-2.")
im.save(newimg2,'JPEG')
def downloadpage(self,index):
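        # Open a listing page in Edge through the Windows "start" command.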
#https://www.xxx.com/code/page/21
print(index)
os.system("start msedge https://www.xxx.com/code/page/"+str(index))
def office368(self,pagestr,userid):
        # office procurement category
#https://www.xxx.com/category.php?id=67
thisurl="https://www.xxx.com/category.php?id="+str(pagestr)
r=requests.get(url=thisurl)
r.encoding='utf-8'
#print(r.text)
list_id=[]
list_img=[]
m_index=0
soup = BeautifulSoup(r.text,'html.parser')
ul=soup.find('ul',class_='list-grid clearfix')
if not ul:
print("ul error ")
return
lis=ul.find_all('li',class_='item')
for li in lis:
href=str(li.find('a').get('href'))
list_id.append(href)
imgsrc=str(li.find('img').get('data-original'))
list_img.append(imgsrc)
#print(href,imgsrc)
            #break
for viewid in list_id:
m_index+=1
goodsidurl="https://www.xxx.com/"+viewid
thisurl=goodsidurl
r=requests.get(goodsidurl)
r.encoding='utf-8'
time.sleep(3)
print( time.asctime( time.localtime(time.time()) ))
soup = BeautifulSoup(r.text,'html.parser')
tit=soup.find('div',class_='goods-name').get_text()
#print(tit)
#print("..")
money=str(soup.find('font',class_='market-price').get_text()).replace("¥","").replace(",","")
#print(money)
#print(tit.get_text())
#txt_o=soup.find('div',id='bqdiv1')
#imgs=txt_o.find_all('img')
#for img in imgs:
# imgscr=str(img.get('src'))
# if len(imgscr)>1 and -1==imgscr.find('http') :
# img['src']="https://www.xxx.com"+imgscr
txt=str(soup.find('div',class_='right-con')).replace('阳光易购',"")
#time.sleep(10000000)
if(len(money)>1 and len(tit)>1 and len(txt)>1 ):
i=list_id.index(viewid)
t_imgsrc=list_img[i]
k=t_imgsrc.rfind('/')+1
#https://img.xxx.com/imgextra/i4/682114580/TB1IIsEXyqAXuNjy1XdXXaYcVXa_!!0-item_pic.jpg_300x300.jpg
bhname=t_imgsrc[k:].replace('-','').replace('_!!',"").replace(".jpg_","").replace(".png_","")
bh=str(int(time.time()))+"-"+str(userid);
xsnum=0
if(1==random.randint(0,50)):
xsnum=random.randint(1,10)
url = "http://xxxx/zhf/savedata.php"
data = {"thisurl":thisurl,"xsnum":xsnum,"userid":userid,"bh":bh,'tit':tit,'money':money,"txt":txt,'bauto':0,'burl':"","bh_name":bhname,"ty1id":39,"ty2id":0,"ty3id":0}
res = requests.post(url=url,data=data)
print(res.text)
if(res.text.find("success")>1):
f_path='d:\\upload\\bh\\'+bhname
inc=0
while((not os.path.isfile(f_path)) and inc<3 ):
inc+=1
if not re.match(r"^http",t_imgsrc):
t_imgsrc="https://www.xxx.com"+t_imgsrc
print("download img->"+t_imgsrc+" -> "+f_path)
self.downloadimg(t_imgsrc,f_path)
time.sleep(5)
if os.path.isfile(f_path):
self.uploadfile(bhname,"d:\\upload\\bh\\"+bhname,userid,bh)
time.sleep(5)
# os.rename(filepath,filepath.replace(".html",".xxxx"))
elif (res.text.find("isexist!")>=0):
print(thisurl)
if m_index>3:
print("next page")
time.sleep(5)
break
def uploadfile(self,filename,filePath,userid,bh) :
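        # POST the downloaded image to the remote uploadfile.php endpoint together with userid and bh.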
url="http://xxx/zhf/uploadfile.php"
files = {'file': (filename, open(filePath, 'rb'),'image/jpeg')}
data={"userid":userid,"bh":bh}
r= requests.post(url, data=data, files=files)
        #print(requests.Request('POST', url, files=files).prepare().body.decode('ascii'))  # debug: inspect the prepared multipart body
print(r.text)
def ai_getty1id(self,tit):
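        # Map keywords in the title to a level-1 category id.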
if re.search(r'(网站模板)|(网站源码)',tit):
return 37
elif re.search(r'(品牌设)',tit):
return 152
def haozhan(self,page,userid):
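        # Crawl one listing page of the "haozhan" site, then fetch each item and post it to savedata.php.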
#http://www.xxx.com/code?page=1355
url="http://www.xxx.com/code?page="+str(page)
list_id=[]
list_img=[]
r=requests.get(url)
r.encoding='utf-8'
try:
soup = BeautifulSoup(r.text,'html.parser')
div=soup.find('div',class_="list_items" )
#print (div.get_text())
dts=div.find_all('dt')
for dt in dts:
a=dt.find('a')
#print(a.get('href'))
list_id.append(str(a.get('href')))
#print(bhname)
#imga=ul.find('img')
#list_img.append(str(imga.get('src')))
#print(imga)
#print(imga.get('src'))
#print("-->"+li.get_text())
except AttributeError:
pass
for viewid in list_id:
#print(goodsidurl)
try:
goodsidurl="http://www.xxx.com"+viewid
#print(goodsidurl)
r=requests.get(goodsidurl)
r.encoding='utf-8'
time.sleep(3)
print( time.asctime( time.localtime(time.time()) ))
soup = BeautifulSoup(r.text,'html.parser')
thisurl=goodsidurl
tit=soup.find('span',class_='cate').get_text()
print("-->")
print(tit)
time.sleep(180)
break
#print("..")
money=str(soup.find('span',id='nowmoney').get_text())
#print(money)
#print(tit.get_text())
txt_o=soup.find('div',id='bqdiv1')
imgs=txt_o.find_all('img')
for img in imgs:
imgscr=str(img.get('src'))
if len(imgscr)>1 and -1==imgscr.find('http') :
img['src']="https://www.xxx.com"+imgscr
txt=str(soup.find('div',id='bqdiv1'))
#time.sleep(10000000)
if(len(money)>1 and len(tit)>1 and len(txt)>1 ):
i=list_id.index(viewid)
t_imgsrc=list_img[i]
k=t_imgsrc.rfind('/')+1
bhname=t_imgsrc[k:].replace('-','')
bh=str(int(time.time()))+"-"+str(userid);
xsnum=0
if(1==random.randint(0,50)):
xsnum=random.randint(1,10)
url = "http://xxx/zhf/savedata.php"
data = {"thisurl":thisurl,"xsnum":xsnum,"userid":userid,"bh":bh,'tit':tit,'money':money,"txt":txt,'bauto':0,'burl':"","bh_name":bhname}
if len(txt)>1:
res = requests.post(url=url,data=data)
print(res.text)
if(res.text.find("success")>1):
f_path='d:\\upload\\bh\\'+bhname
if not os.path.isfile(f_path):
print("download img->"+t_imgsrc+" -> "+f_path)
self.downloadimg(t_imgsrc,f_path)
time.sleep(5)
self.uploadfile(bhname,"d:\\upload\\bh\\"+bhname,userid,bh)
time.sleep(5)
# os.rename(filepath,filepath.replace(".html",".xxxx"))
elif (res.text.find("isexist!")>=0):
print(thisurl)
else:
print("txt is null")
except AttributeError:
pass
except FileNotFoundError:
pass
except BaseException:
pass
def zhisu(self,page,userid):
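        # Crawl one listing page of the "zhisu" site (gb2312 pages), then fetch each item and post it to savedata.php.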
#https://www.xxx.com/product/search_j1v_p1v.html
url="https://www.xxx.com/product/search_j1v_p"+str(page)+"v.html"
list_id=[]
list_img=[]
r=requests.get(url)
r.encoding='gb2312'
try:
soup = BeautifulSoup(r.text,'html.parser')
div=soup.find('div',class_="biglist" )
#print (div.get_text())
uls=div.find_all('ul',class_='u1')
for ul in uls:
a=ul.find('a')
#print(a.get('href'))
list_id.append(str(a.get('href')))
#print(bhname)
imga=ul.find('img')
list_img.append(str(imga.get('src')))
#print(imga)
#print(imga.get('src'))
#print("-->"+li.get_text())
except AttributeError:
pass
for viewid in list_id:
#print(goodsidurl)
try:
goodsidurl="https://www.xxx.com"+viewid
#print(goodsidurl)
r=requests.get(goodsidurl)
r.encoding='gb2312'
time.sleep(3)
print( time.asctime( time.localtime(time.time()) ))
soup = BeautifulSoup(r.text,'html.parser')
thisurl=goodsidurl
tit_p=soup.find('div',id='jbmiddle')
tit=tit_p.find('h1').get_text()
#print(tit)
#print("..")
money=str(soup.find('span',id='nowmoney').get_text())
#print(money)
#print(tit.get_text())
txt_o=soup.find('div',id='bqdiv1')
imgs=txt_o.find_all('img')
for img in imgs:
imgscr=str(img.get('src'))
if len(imgscr)>1 and -1==imgscr.find('http') :
img['src']="https://www.xxx.com"+imgscr
txt=str(soup.find('div',id='bqdiv1'))
#time.sleep(10000000)
if(len(money)>1 and len(tit)>1 and len(txt)>1 ):
i=list_id.index(viewid)
t_imgsrc=list_img[i]
k=t_imgsrc.rfind('/')+1
bhname=t_imgsrc[k:].replace('-','')
bh=str(int(time.time()))+"-"+str(userid);
xsnum=0
if(1==random.randint(0,50)):
xsnum=random.randint(1,10)
url = "http://xxx/zhf/savedata.php"
data = {"thisurl":thisurl,"xsnum":xsnum,"userid":userid,"bh":bh,'tit':tit,'money':money,"txt":txt,'bauto':0,'burl':"","bh_name":bhname}
if len(txt)>1:
res = requests.post(url=url,data=data)
print(res.text)
if(res.text.find("success")>1):
f_path='d:\\upload\\bh\\'+bhname
inc=0
while((not os.path.isfile(f_path)) and inc<3 ):
inc+=1
print("download img->"+t_imgsrc+" -> "+f_path)
self.downloadimg(t_imgsrc,f_path)
time.sleep(5)
if os.path.isfile(f_path):
self.uploadfile(bhname,"d:\\upload\\bh\\"+bhname,userid,bh)
time.sleep(5)
# os.rename(filepath,filepath.replace(".html",".xxxx"))
elif (res.text.find("isexist!")>=0):
print(thisurl)
else:
print("txt is null")
except AttributeError:
pass
except FileNotFoundError:
pass
except BaseException:
pass
def w87zx(self,page,userid):
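        # Same flow as zhisu for the "w87zx" site: crawl the listing, then post each item and its image.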
#https://www.xxx.com/product/search_j1v_p1v.html
url="https://www.xxx.com/product/search_j1v_p"+str(page)+"v.html"
list_id=[]
list_img=[]
r=requests.get(url)
r.encoding='gb2312'
try:
soup = BeautifulSoup(r.text,'html.parser')
div=soup.find('div',class_="biglist" )
#print (div.get_text())
uls=div.find_all('ul',class_='u1')
for ul in uls:
a=ul.find('a')
#print(a.get('href'))
list_id.append(str(a.get('href')))
#print(bhname)
imga=ul.find('img')
list_img.append(str(imga.get('src')))
#print(imga)
#print(imga.get('src'))
#print("-->"+li.get_text())
except AttributeError:
print("AttributeError")
for viewid in list_id:
#print(goodsidurl)
try:
goodsidurl="https://www.xxx.com/product/"+viewid
r=requests.get(goodsidurl)
r.encoding='gb2312'
time.sleep(3)
print( time.asctime( time.localtime(time.time()) ))
soup = BeautifulSoup(r.text,'html.parser')
thisurl=goodsidurl
tit_p=soup.find('div',id='jbmiddle')
tit=tit_p.find('h1').get_text()
#print(tit)
#print("..")
money=str(soup.find('span',id='nowmoney').get_text())
#print(money)
txt_o=soup.find('div',id='bqdiv1')
imgs=txt_o.find_all('img')
for img in imgs:
imgscr=str(img.get('src'))
if len(imgscr)>1 and -1==imgscr.find('http') :
img['src']="https://www.xxx.com"+imgscr
txt=str(soup.find('div',id='bqdiv1'))
#time.sleep(10000000)
if(len(money)>1 and len(tit)>1 and len(txt)>1 ):
i=list_id.index(viewid)
t_imgsrc=list_img[i]
k=t_imgsrc.rfind('/')+1
bhname=t_imgsrc[k:].replace('-','')
bh=str(int(time.time()))+"-"+str(userid);
xsnum=0
if(1==random.randint(0,50)):
xsnum=random.randint(1,10)
url = "http://xxx/zhf/savedata.php"
data = {"thisurl":thisurl,"xsnum":xsnum,"userid":userid,"bh":bh,'tit':tit,'money':money,"txt":txt,'bauto':0,'burl':"","bh_name":bhname}
if len(txt)>1:
res = requests.post(url=url,data=data)
print(res.text)
if(res.text.find("success")>1):
f_path='d://upload//bh//'+bhname
inc=0
while((not os.path.isfile(f_path)) and inc<3 ):
inc+=1
print("download img->"+t_imgsrc+" -> "+f_path)
self.downloadimg(t_imgsrc,f_path)
time.sleep(5)
if os.path.isfile(f_path):
self.uploadfile(bhname,"d:\\upload\\bh\\"+bhname,userid,bh)
time.sleep(5)
# os.rename(filepath,filepath.replace(".html",".xxxx"))
elif (res.text.find("isexist!")>=0):
print(thisurl)
else:
print("txt is null")
except AttributeError:
print("AttributeError "+viewid)
except FileNotFoundError:
print("FileNotFoundError "+viewid)
except BaseException:
print("BaseException "+viewid)
def suibianlu_s(self,page,userid):
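        # Crawl one code_<page> listing and collect the item links and thumbnail image URLs.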
url="https://www.xxx.com/code_"+str(page)
list_id=[]
list_img=[]
r=requests.get(url)
r.encoding='utf-8'
try:
soup = BeautifulSoup(r.text,'html.parser')
lis=soup.find_all('li',class_="clearfix" )
for li in lis:
#time.sleep(1)
#print(li.find('a').get('href'))
goodsid=li.find('a').get('href')
if len(goodsid):
list_id.append(goodsid)
s=goodsid.split('/')
bhn=(s[len(s)-1]).replace(".html","")
imgsrc=li.find('img').get('src')
if len(imgsrc):
list_img.append(imgsrc)
#print(bhname)
#print("-->"+li.get_text())
except AttributeError:
pass
for goodsidurl in list_id:
#print(goodsidurl)