#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
from selenium import webdriver
import urllib.parse
import argparse
import time
logo = '''
_______ ___ ___ _
|__ __| / _ \ / _ \ | |
| | | | | | | | | | | | ___
| | | | | | | | | | | | / __|
| | | |_| | | |_| | | | \__ \\
|_| \___/ \___/ |_| |___/
'''
print(logo)
def Obtain_url(page_number, your_url):  # scrape Bing search results for URLs
    browser = ChromeDriverNOBrowser()
    browser.implicitly_wait(100)
    page = ['&first=' + str(i) for i in range(1, page_number, 10)]
    crux = your_url
    crux = urllib.parse.quote(crux)  # URL-encode the query to avoid encoding errors
    with open('url.txt', 'a', encoding='utf-8') as f:
        count_page = 1
        for i in page:
            print('Scraping page ' + str(count_page) + ':')
            count_page = count_page + 1
            current_url = 'https://cn.bing.com/search?q=' + crux + i
            print(current_url)
            # Reuse the headless browser created above; the original code
            # opened a fresh, visible Chrome window on every page and never
            # closed it.
            browser.get(current_url)
            content = browser.page_source
            res = re.compile(r'<h2><a target="_blank" href="(.*?)"')
            data = res.findall(content)
            print(data)
            if len(data) == 0:
                continue
            for j in data:
                print(j)
                f.write(j + '\n')
    browser.quit()
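# Quick illustration of the encoding step above (an interpreter-style sketch;
# quote() percent-encodes ':' and '?' so the dork reaches Bing intact):
#   >>> urllib.parse.quote('inurl:php?id')
#   'inurl%3Aphp%3Fid'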
def ChromeDriverNOBrowser():  # build a headless Chrome driver (no visible window)
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    # Selenium 4 removed the chrome_options= keyword; pass options= instead.
    driverChrome = webdriver.Chrome(options=chrome_options)
    return driverChrome
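# Usage sketch for the factory above, assuming chromedriver is on PATH
# (kept as comments so the script itself is unchanged):
#   browser = ChromeDriverNOBrowser()
#   browser.get('https://cn.bing.com')
#   print(browser.title)
#   browser.quit()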
def url():  # post-process the links scraped from Bing
    url = []
    print('Processing links')
    # Read the scraped links back into a list. Obtain_url writes to url.txt,
    # so read that (the original read result.txt, which nothing creates).
    with open('url.txt', 'r', encoding='utf-8') as f:
        for i in f.readlines():
            url.append(i)
    data = list(set(url))  # de-duplicate the URL list
    data = sorted(data)  # sort for stable output
    # Keep only URLs on .php / .asp / .aspx pages that take
    # query-string parameters.
    with open('new_url.txt', 'a', encoding='utf-8') as f:
        for i in data:
            if re.search(r'\.(php|aspx?)\?', i):
                f.write(i)
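# What the filter above keeps and drops, on hypothetical sample URLs:
#   >>> bool(re.search(r'\.(php|aspx?)\?', 'http://example.com/item.php?id=1'))
#   True
#   >>> bool(re.search(r'\.(php|aspx?)\?', 'http://example.com/about.html'))
#   False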
# Timestamp used to name the de-duplicated output file. strftime zero-pads
# every field, and it avoids shadowing the built-in min() the way the
# original field-by-field version did.
nowtime = time.strftime('%Y%m%d%H%M%S')
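# Interpreter-style check (the value depends on the clock; shown as an example):
#   >>> time.strftime('%Y%m%d%H%M%S')
#   '20240305090701'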
def openThefile():
    '''
    1. Open the file to be de-duplicated.
    2. Strip useless characters from the start and end of each line.
    :return: list of cleaned lines
    '''
    f = "new_url.txt"
    ff = open(f, 'r', encoding='utf-8')
    l = []
    for i in ff.readlines():  # every line of the file
        if i != '\n' and i != '\r\n':  # skip blank lines
            ii = i.replace('\t', '').strip()
            l.append(ii)
    ff.close()
    return l
def createNewfile(openThefile):
    '''
    De-duplicate the cleaned lines while preserving their order.
    :param openThefile: list of lines returned by openThefile()
    :return:
    '''
    l = []
    for i in openThefile:
        if i not in l:  # keep only the first occurrence of each line
            l.append(i)
    # Create the new file, named new_url_<timestamp>.txt
    f = "new_url.txt"
    filename = f[:f.find('.')]
    postfix = f[f.find('.'):]
    theNewfile = open(filename + '_' + nowtime + postfix, 'a', encoding='utf-8')
    # Write the processed content
    for i in l:
        theNewfile.writelines(i + '\n')
    theNewfile.close()
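# The membership test in createNewfile is O(n^2). For large files, an
# order-preserving alternative is dict.fromkeys (untested sketch, same output):
#   >>> list(dict.fromkeys(['b', 'a', 'b']))
#   ['b', 'a']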
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.description = 'Automated crawler for Bing search results'
    parser.add_argument('-u', '--your_url', help="your search dork", type=str)
    parser.add_argument('-p', '--page_number', help="maximum result offset to crawl (stepped by 10)", type=int)
    args = parser.parse_args()
    Obtain_url(args.page_number, args.your_url)
    url()
    createNewfile(openThefile())
# Took a while of debugging; install selenium and the Chrome driver
# (chromedriver) beforehand. Example usage:
#     python 1.py -u inurl:php?id -p 100
# This searches Bing with the Google-dork keyword inurl:php?id and
# crawls up to 100/10 = 10 pages of results.
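# Files produced by a full run, per the code above:
#   url.txt                   raw links scraped from Bing
#   new_url.txt               links filtered to .php? / .asp? / .aspx?
#   new_url_<timestamp>.txt   final de-duplicated list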