#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
from selenium import webdriver
import urllib.parse
import argparse
import time
logo = '''
_______ ___ ___ _
|__ __| / _ \ / _ \ | |
| | | | | | | | | | | | ___
| | | | | | | | | | | | / __|
| | | |_| | | |_| | | | \__ \\
|_| \___/ \___/ |_| |___/
'''
print(logo)
def Obtain_url(page_number, your_url):  # scrape Bing search results for URLs
    browser = ChromeDriverNOBrowser()
    browser.implicitly_wait(100)
    page = ['&first=' + str(i) for i in range(1, page_number, 10)]
    crux = your_url
    crux = urllib.parse.quote(crux)  # URL-encode the query to avoid encoding errors
    with open('url.txt', 'a', encoding='utf-8') as f:
        count_page = 1
        for i in page:
            print('Scraping page ' + str(count_page) + ':')
            count_page = count_page + 1
            current_url = 'https://cn.bing.com/search?q=' + crux + i
            print(current_url)
            # Reuse the headless browser created above; the original code
            # opened a fresh, visible Chrome window on every page and never
            # closed it.
            browser.get(current_url)
            content = browser.page_source
            res = re.compile(r'<h2><a target="_blank" href="(.*?)"')
            data = res.findall(content)
            print(data)
            if len(data) == 0:
                continue
            for j in data:
                print(j)
                f.write(j + '\n')
    browser.quit()
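# Quick illustration of the encoding step above (an interpreter-style sketch;
# quote() percent-encodes ':' and '?' so the dork reaches Bing intact):
#   >>> urllib.parse.quote('inurl:php?id')
#   'inurl%3Aphp%3Fid'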
def ChromeDriverNOBrowser():  # build a headless Chrome driver (no visible window)
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    # Selenium 4 removed the chrome_options= keyword; pass options= instead.
    driverChrome = webdriver.Chrome(options=chrome_options)
    return driverChrome
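# Usage sketch for the factory above, assuming chromedriver is on PATH
# (kept as comments so the script itself is unchanged):
#   browser = ChromeDriverNOBrowser()
#   browser.get('https://cn.bing.com')
#   print(browser.title)
#   browser.quit()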
def url():  # post-process the links scraped from Bing
    url = []
    print('Processing links')
    # Read the scraped links back into a list. Obtain_url writes to url.txt,
    # so read that (the original read result.txt, which nothing creates).
    with open('url.txt', 'r', encoding='utf-8') as f:
        for i in f.readlines():
            url.append(i)
    data = list(set(url))  # de-duplicate the URL list
    data = sorted(data)  # sort for stable output
    # Keep only URLs on .php / .asp / .aspx pages that take
    # query-string parameters.
    with open('new_url.txt', 'a', encoding='utf-8') as f:
        for i in data:
            if re.search(r'\.(php|aspx?)\?', i):
                f.write(i)
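# What the filter above keeps and drops, on hypothetical sample URLs:
#   >>> bool(re.search(r'\.(php|aspx?)\?', 'http://example.com/item.php?id=1'))
#   True
#   >>> bool(re.search(r'\.(php|aspx?)\?', 'http://example.com/about.html'))
#   False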
# Timestamp used to name the de-duplicated output file. strftime zero-pads
# every field, and it avoids shadowing the built-in min() the way the
# original field-by-field version did.
nowtime = time.strftime('%Y%m%d%H%M%S')
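# Interpreter-style check (the value depends on the clock; shown as an example):
#   >>> time.strftime('%Y%m%d%H%M%S')
#   '20240305090701'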
def openThefile():
    '''
    1. Open the file to be de-duplicated.
    2. Strip useless characters from the start and end of each line.
    :return: list of cleaned lines
    '''
    f = "new_url.txt"
    ff = open(f, 'r', encoding='utf-8')
    l = []
    for i in ff.readlines():  # every line of the file
        if i != '\n' and i != '\r\n':  # skip blank lines
            ii = i.replace('\t', '').strip()
            l.append(ii)
    ff.close()
    return l
def createNewfile(openThefile):
    '''
    De-duplicate the cleaned lines while preserving their order.
    :param openThefile: list of lines returned by openThefile()
    :return:
    '''
    l = []
    for i in openThefile:
        if i not in l:  # keep only the first occurrence of each line
            l.append(i)
    # Create the new file, named new_url_<timestamp>.txt
    f = "new_url.txt"
    filename = f[:f.find('.')]
    postfix = f[f.find('.'):]
    theNewfile = open(filename + '_' + nowtime + postfix, 'a', encoding='utf-8')
    # Write the processed content
    for i in l:
        theNewfile.writelines(i + '\n')
    theNewfile.close()
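# The membership test in createNewfile is O(n^2). For large files, an
# order-preserving alternative is dict.fromkeys (untested sketch, same output):
#   >>> list(dict.fromkeys(['b', 'a', 'b']))
#   ['b', 'a']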
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.description = 'Automated crawler for Bing search results'
    parser.add_argument('-u', '--your_url', help="your search dork", type=str)
    parser.add_argument('-p', '--page_number', help="maximum result offset to crawl (stepped by 10)", type=int)
    args = parser.parse_args()
    Obtain_url(args.page_number, args.your_url)
    url()
    createNewfile(openThefile())
# Took a while of debugging; install selenium and the Chrome driver
# (chromedriver) beforehand. Example usage:
#     python 1.py -u inurl:php?id -p 100
# This searches Bing with the Google-dork keyword inurl:php?id and
# crawls up to 100/10 = 10 pages of results.
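# Files produced by a full run, per the code above:
#   url.txt                   raw links scraped from Bing
#   new_url.txt               links filtered to .php? / .asp? / .aspx?
#   new_url_<timestamp>.txt   final de-duplicated list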