首页 Python

简介

这是一个用 Linux 定时任务(cron)运行、用来提醒自己的小脚本,写得比较粗糙;反爬部分的分析详见:《Python爬虫:某网站cookie参数__jsl_clearance_s生成分析(一)》

代码

# -*- coding: utf-8 -*-
import requests
import urllib3
from lxml import etree
import re
import execjs
import hashlib
import json
import random
from time import sleep
from requests.utils import add_dict_to_cookiejar
import datetime
from email.utils import formataddr
import smtplib
from email.mime.text import MIMEText

# Silence urllib3's InsecureRequestWarning: the HTTP requests in this script
# are sent with verify=False, which would otherwise emit that warning.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

def getYesterday():
    """Return yesterday's date as a ``datetime.date`` (local system date minus one day)."""
    return datetime.date.today() - datetime.timedelta(days=1)

def get_UA():
    """Return a headers dict ``{'User-Agent': ...}`` with one randomly chosen User-Agent string."""
    # Pool of User-Agent strings; one is picked at random per call.
    agents = (
        'Mozilla/4.0 (Mozilla/4.0; MSIE 7.0; Windows NT 5.1; FDM; SV1; .NET CLR 3.0.04506.30)',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; en) Opera 11.00',
        'Mozilla/5.0 (X11; U; Linux i686; de; rv:1.9.0.2) Gecko/2008092313 Ubuntu/8.04 (hardy) Firefox/3.0.2',
        'Mozilla/5.0 (X11; U; Linux i686; en-GB; rv:1.9.1.15) Gecko/20101027 Fedora/3.5.15-1.fc12 Firefox/3.5.15',
        'Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/8.0.551.0 Safari/534.10',
        'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.2) Gecko/2008092809 Gentoo Firefox/3.0.2',
        'Mozilla/5.0 (X11; U; Linux x86_64; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/7.0.544.0',
        'Opera/9.10 (Windows NT 5.2; U; en)',
        'Mozilla/5.0 (iPhone; U; CPU OS 3_2 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko)',
        'Opera/9.80 (X11; U; Linux i686; en-US; rv:1.9.2.3) Presto/2.2.15 Version/10.10',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; ru-RU) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; ru; rv:1.9b3) Gecko/2008020514 Firefox/3.0b3',
        'Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10_4_11; fr) AppleWebKit/533.16 (KHTML, like Gecko) Version/5.0 Safari/533.16',
        'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; en-US) AppleWebKit/534.20 (KHTML, like Gecko) Chrome/11.0.672.2 Safari/534.20',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; InfoPath.2)',
        'Mozilla/4.0 (compatible; MSIE 6.0; X11; Linux x86_64; en) Opera 9.60',
        'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_2; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.366.0 Safari/533.4',
        'Mozilla/5.0 (Windows NT 6.0; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.51',
    )
    # Wrap each string in its own headers dict, then pick one of them.
    header_pool = [{'User-Agent': agent} for agent in agents]
    return random.choice(header_pool)

def getCookie(data):
    """
    Brute-force the cookie value whose hash matches the challenge digest.

    Every candidate ``bts[0] + c1 + c2 + bts[1]`` (with ``c1`` and ``c2``
    drawn from ``chars``) is hashed with the algorithm named by ``ha`` and
    compared against ``ct``.

    :param data: challenge dict with the keys ``bts`` (two-element prefix/
        suffix sequence), ``chars`` (candidate characters), ``ha`` (one of
        ``'md5'``, ``'sha1'``, ``'sha256'``) and ``ct`` (target hex digest)
    :return: the matching cookie value, or ``None`` if no candidate matches
    :raises ValueError: if ``ha`` names an unsupported hash algorithm
    """
    supported = {
        'md5': hashlib.md5,
        'sha1': hashlib.sha1,
        'sha256': hashlib.sha256,
    }
    algorithm = data['ha']
    # The algorithm is the same for every candidate, so resolve it once
    # instead of once per inner-loop iteration.
    hasher = supported.get(algorithm)
    if hasher is None:
        raise ValueError('unsupported hash algorithm: %r' % (algorithm,))

    chars = data['chars']
    prefix, suffix = data['bts'][0], data['bts'][1]
    target = data['ct']
    for first in chars:
        for second in chars:
            clearance = prefix + first + second + suffix
            if hasher(clearance.encode()).hexdigest() == target:
                return clearance
    # No two-character combination reproduced the target digest.
    return None

def getResponse(session, url):
    """
    GET ``url`` through ``session``, answering the HTTP 521 cookie challenge if served.

    When the first reply has status 521, the ``__jsl_clearance_s`` cookie is
    set twice (first from the JavaScript in the reply, then from the value
    computed by ``getCookie``) and the URL is requested again.

    :param session: requests session whose cookie jar is updated in place
    :param url: address to fetch
    :return: the first response if it was not a 521, otherwise the response
        of the third request
    """
    ua_headers = get_UA()
    first_reply = session.get(url, verify=False, headers=ua_headers, timeout=6)
    if first_reply.status_code != 521:
        return first_reply

    # Stage 1: pull the cookie-building JavaScript expression out of the page.
    cookie_js = re.findall('cookie=(.*?);location', first_reply.text)[0]
    # NOTE: this evaluates JavaScript taken from the server's response.
    evaluated = str(execjs.eval(cookie_js))
    # Keep only the value part of "name=value;...".
    stage1_value = evaluated.split('=')[1].split(';')[0]
    add_dict_to_cookiejar(session.cookies, {'__jsl_clearance_s': stage1_value})

    # Stage 2: the next page passes the challenge parameters as JSON to go(...).
    challenge_page = session.get(url, verify=False, headers=ua_headers, timeout=5)
    go_arguments = re.findall(r';go\((.*?)\)', challenge_page.text)
    challenge = json.loads(go_arguments[0])
    stage2_value = getCookie(challenge)
    add_dict_to_cookiejar(session.cookies, {'__jsl_clearance_s': stage2_value})

    # Final request, sent with the second cookie value in the jar.
    return session.get(url, verify=False, headers=ua_headers, timeout=5)

# Send the notification e-mail
def sendmail(recevier, message):
    """
    Send ``message`` as a plain-text UTF-8 e-mail to ``recevier`` via smtp.qq.com.

    :param recevier: recipient e-mail address
    :param message: text used as the e-mail body
    :return: ``True`` if every step completed without raising, ``False`` if
        any step (building, connecting, login, sending) raised an exception
    """
    # Placeholder credential; presumably the mailbox's SMTP authorization
    # code - confirm before deploying.
    code = 'bxxxxxa'
    # Third-party SMTP service
    my_sender = 'xxx@qq.com'  # sender mailbox account
    my_pass = code  # sender mailbox password
    my_user = recevier

    try:
        msg = MIMEText(message, 'plain', 'utf-8')
        # formataddr takes (display name, mailbox address).
        msg['From'] = formataddr(("人工智障", my_sender))
        msg['To'] = formataddr(("QAQ", my_user))
        msg['Subject'] = "CNVD编号"  # subject line of the e-mail

        # SMTP over SSL on port 465. The with-block closes the connection
        # even when login or sending fails, so the socket is never leaked.
        with smtplib.SMTP_SSL("smtp.qq.com", 465) as server:
            server.login(my_sender, my_pass)
            server.sendmail(my_sender, [my_user], msg.as_string())
    except Exception:
        # Best-effort contract: failures are reported through the return
        # value instead of propagating to the caller.
        return False
    return True


def _parse_cnvd_detail(html, url):
    """Extract the fields of one CNVD detail page (``html``) into a dict; raises if a required field is missing."""
    tree = etree.HTML(html)
    title = tree.xpath("//div[@class='blkContainerSblk']/h1/text()")[0].strip()
    product = '\n'.join(text.strip() for text in tree.xpath(
        "//table[@class='gg_detail']/tbody/tr/td[text()='影响产品']/../td[2]/text()")).strip("\n")
    description = '\n'.join(text.strip() for text in tree.xpath(
        "//table[@class='gg_detail']/tbody/tr/td[text()='漏洞描述']/../td[2]/text()")).strip("\n")
    submission_time = tree.xpath(
        "//table[@class='gg_detail']/tbody/tr/td[text()='报送时间']/../td[2]/text()")[0].strip()
    # Raw string so that \s and \( reach the regex engine unchanged; the
    # character class lists the three risk levels without a stray '|'.
    risk = re.findall(r'([高中低])\s*\(<a href="#showDiv" class="showInfo">', html)[0]
    return {'title': title, 'risk': risk, 'submissionTime': submission_time,
            'url': url, 'product': product, 'description': description}


def getResults():
    """
    Collect yesterday's entries from the CNVD listing, record new ones and e-mail them.

    Entries whose CNVD id is already present in ``cnvdResults.json`` are
    skipped. When at least one new entry was parsed, the merged records are
    written back to ``cnvdResults.json`` and the new entries are sent with
    ``sendmail``. Failed requests and failed page parses are retried without
    an upper bound.
    """
    url = "https://www.cnvd.org.cn/flaw/typeResult?typeId=29&max=40&offset=0"
    # Entries loaded from / written to the record file (CNVD id -> details).
    records = {}
    # Entries found in this run; this is what gets e-mailed.
    results = {}
    # One session is reused for all requests so that cookies set by
    # getResponse are kept between them.
    session = requests.session()
    while True:
        try:
            response = getResponse(session, url)
            break
        except Exception:
            # Exception rather than a bare except, so that Ctrl-C and
            # SystemExit can still stop the retry loop.
            sleep(5)

    listing = etree.HTML(response.text)

    # Detail-page URLs of the rows whose 6th column equals yesterday's date.
    yesterday = str(getYesterday())
    urls = ["https://www.cnvd.org.cn" + href for href in listing.xpath(
        "//tbody/tr/td[6][text()='" + yesterday + "']/../td[1]/a/@href")]
    # The CNVD id is the last path segment of each detail URL.
    cnvdIds = [detail_url.split('/')[-1] for detail_url in urls]

    try:
        # Explicit UTF-8: the file holds non-ASCII text (ensure_ascii=False).
        with open('cnvdResults.json', 'r', encoding='utf-8') as f:
            records = json.load(f)
    except (OSError, ValueError):
        # Missing, unreadable or malformed record file: start with no records.
        pass

    for cnvd_id, detail_url in zip(cnvdIds, urls):
        if cnvd_id in records:
            continue
        while True:
            try:
                sleep(1)
                page = getResponse(session, detail_url).text
                # The site reports the entry as missing or deleted: skip it.
                if "请检查您的操作是否正确!您访问的资源不存在或已被删除" in page:
                    break
                entry = _parse_cnvd_detail(page, detail_url)
                records[cnvd_id] = entry
                results[cnvd_id] = entry
                break
            except Exception:
                # Request or parse failure: wait, then retry the same entry.
                sleep(5)

    if results:
        # Persist the merged records.
        with open('cnvdResults.json', 'w', encoding='utf-8') as f:
            f.write(json.dumps(records, sort_keys=False, indent=4, separators=(',', ':'), ensure_ascii=False))
        # E-mail only the newly found entries.
        # NOTE(review): sendmail's return value is ignored, so a failed send
        # is not retried for entries that were just recorded - confirm intent.
        sendmail("xxx@qq.com", json.dumps(results, sort_keys=False, indent=4, separators=(',', ':'), ensure_ascii=False))


# Run one fetch / record / notify pass when the file is executed as a script.
if __name__ == '__main__':
    getResults()


文章评论