简介
Linux 定时任务提醒自己用的,写得有点呆。反爬部分详见:《Python爬虫:某网站cookie参数__jsl_clearance_s生成分析(一)》
代码
# -*- coding: utf-8 -*-
import requests
import urllib3
from lxml import etree
import re
import execjs
import hashlib
import json
import random
from time import sleep
from requests.utils import add_dict_to_cookiejar
import datetime
from email.utils import formataddr
import smtplib
from email.mime.text import MIMEText
# Silence urllib3's InsecureRequestWarning; the HTTP requests in this file are
# sent with verify=False, which would otherwise emit it on every call.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
def getYesterday():
    """Return yesterday's date.

    Returns:
        datetime.date: ``datetime.date.today()`` minus one day. Its ``str()``
        form is the ISO ``YYYY-MM-DD`` string.
    """
    return datetime.date.today() - datetime.timedelta(days=1)
# User-Agent strings to rotate through. Built once at import time instead of
# on every call to get_UA().
_UA_POOL = (
    'Mozilla/4.0 (Mozilla/4.0; MSIE 7.0; Windows NT 5.1; FDM; SV1; .NET CLR 3.0.04506.30)',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; en) Opera 11.00',
    'Mozilla/5.0 (X11; U; Linux i686; de; rv:1.9.0.2) Gecko/2008092313 Ubuntu/8.04 (hardy) Firefox/3.0.2',
    'Mozilla/5.0 (X11; U; Linux i686; en-GB; rv:1.9.1.15) Gecko/20101027 Fedora/3.5.15-1.fc12 Firefox/3.5.15',
    'Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/8.0.551.0 Safari/534.10',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.2) Gecko/2008092809 Gentoo Firefox/3.0.2',
    'Mozilla/5.0 (X11; U; Linux x86_64; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/7.0.544.0',
    'Opera/9.10 (Windows NT 5.2; U; en)',
    'Mozilla/5.0 (iPhone; U; CPU OS 3_2 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko)',
    'Opera/9.80 (X11; U; Linux i686; en-US; rv:1.9.2.3) Presto/2.2.15 Version/10.10',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; ru-RU) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; ru; rv:1.9b3) Gecko/2008020514 Firefox/3.0b3',
    'Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10_4_11; fr) AppleWebKit/533.16 (KHTML, like Gecko) Version/5.0 Safari/533.16',
    'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; en-US) AppleWebKit/534.20 (KHTML, like Gecko) Chrome/11.0.672.2 Safari/534.20',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; InfoPath.2)',
    'Mozilla/4.0 (compatible; MSIE 6.0; X11; Linux x86_64; en) Opera 9.60',
    'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_2; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.366.0 Safari/533.4',
    'Mozilla/5.0 (Windows NT 6.0; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.51',
)


def get_UA():
    """Return request headers carrying a randomly chosen User-Agent.

    Returns:
        dict: a new single-key dict ``{'User-Agent': <string>}`` on every
        call, so callers may add further headers to it without affecting
        later calls.
    """
    return {'User-Agent': random.choice(_UA_POOL)}
def getCookie(data):
    """Find the cookie value whose hash matches the expected digest.

    Every two-character combination drawn from ``data['chars']`` is inserted
    between ``data['bts'][0]`` and ``data['bts'][1]``; the candidate is hashed
    with the algorithm named by ``data['ha']`` and compared to ``data['ct']``.

    :param data: mapping with the keys ``bts`` (prefix and suffix strings),
        ``chars`` (candidate characters), ``ha`` (``'md5'``, ``'sha1'`` or
        ``'sha256'``) and ``ct`` (expected hex digest).
    :return: the first matching cookie value, or ``None`` if no combination
        produces ``data['ct']``.
    :raises ValueError: if ``data['ha']`` names an unsupported algorithm.
    """
    hashers = {
        'md5': hashlib.md5,
        'sha1': hashlib.sha1,
        'sha256': hashlib.sha256,
    }
    algorithm = data['ha']
    hasher = hashers.get(algorithm)
    if hasher is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError('unsupported hash algorithm: %r' % (algorithm,))

    prefix, suffix = data['bts'][0], data['bts'][1]
    candidates = data['chars']
    expected = data['ct']
    for first in candidates:
        for second in candidates:
            clearance = prefix + first + second + suffix
            if hasher(clearance.encode()).hexdigest() == expected:
                return clearance
    return None
def getResponse(session, url):
    """GET *url*, answering the ``__jsl_clearance_s`` cookie challenge if asked.

    A first request is sent with a random User-Agent. If the server replies
    with HTTP 521, the two-step challenge is solved and the request is
    repeated with the resulting cookie; otherwise the first response is
    returned unchanged.

    :param session: session object used for all requests; the computed
        cookies are stored in ``session.cookies``.
    :param url: address to fetch.
    :return: the final response object.
    :raises ValueError: if a challenge page does not contain the expected
        script fragment, or no cookie value matches the challenge.
    """
    headers = get_UA()
    response = session.get(url, verify=False, headers=headers, timeout=6)
    if response.status_code != 521:
        return response

    # Step 1: the 521 page carries a JS expression that builds the first cookie.
    js_matches = re.findall('cookie=(.*?);location', response.text)
    if not js_matches:
        raise ValueError('521 response without the expected cookie script')
    # NOTE(security): this evaluates JavaScript received from the remote
    # server; only use against a site you are prepared to trust with that.
    cookie_string = str(execjs.eval(js_matches[0]))
    jsl_clearance_s = cookie_string.split('=')[1].split(';')[0]
    add_dict_to_cookiejar(session.cookies, {'__jsl_clearance_s': jsl_clearance_s})

    # Step 2: the next page passes the hash-challenge parameters to go(...).
    response2 = session.get(url, verify=False, headers=headers, timeout=5)
    go_matches = re.findall(r';go\((.*?)\)', response2.text)
    if not go_matches:
        raise ValueError('second challenge page without go(...) parameters')
    data = json.loads(go_matches[0])
    jsl_clearance_s = getCookie(data)
    if jsl_clearance_s is None:
        # Do not store a None cookie value; report the failed challenge.
        raise ValueError('no cookie value matches the challenge digest')
    add_dict_to_cookiejar(session.cookies, {'__jsl_clearance_s': jsl_clearance_s})

    return session.get(url, verify=False, headers=headers, timeout=5)
# Send e-mail
def sendmail(recevier, message):
    """Send *message* as a plain-text UTF-8 e-mail through smtp.qq.com.

    :param recevier: recipient e-mail address.
    :param message: text used as the mail body.
    :return: ``True`` if the mail was handed to the server, ``False`` if any
        exception occurred while building or sending it.
    """
    # NOTE(review): credentials are hard-coded placeholders; consider loading
    # them from configuration instead of committing them to source.
    code = 'bxxxxxa'
    my_sender = 'xxx@qq.com'  # sender account
    my_pass = code  # sender password / authorization code
    my_user = recevier
    try:
        msg = MIMEText(message, 'plain', 'utf-8')
        msg['From'] = formataddr(("人工智障", my_sender))  # (display name, sender address)
        msg['To'] = formataddr(("QAQ", my_user))  # (display name, recipient address)
        msg['Subject'] = "CNVD编号"  # mail subject
        server = smtplib.SMTP_SSL("smtp.qq.com", 465)  # SMTP over SSL, port 465
        try:
            server.login(my_sender, my_pass)
            server.sendmail(my_sender, [my_user, ], msg.as_string())
            server.quit()
        finally:
            # Release the socket even when login/sendmail raised; calling
            # close() after a successful quit() is harmless.
            server.close()
    except Exception:  # best effort: any failure is reported as False
        return False
    return True
_LIST_URL = "https://www.cnvd.org.cn/flaw/typeResult?typeId=29&max=40&offset=0"
_SITE_ROOT = "https://www.cnvd.org.cn"
_RECORD_FILE = 'cnvdResults.json'
_NOT_FOUND_MARKER = "请检查您的操作是否正确!您访问的资源不存在或已被删除"
_RETRY_DELAY = 5  # seconds to wait between failed attempts
# Risk level (one of 高 / 中 / 低) that precedes the "showInfo" link.
_RISK_PATTERN = re.compile(r'([高中低])\s*\(<a href="#showDiv" class="showInfo">')


def _retry(func, max_retries, *args):
    """Call ``func(*args)`` until it succeeds; re-raise after *max_retries* failures."""
    attempt = 0
    while True:
        attempt += 1
        try:
            return func(*args)
        except Exception:
            if max_retries is not None and attempt >= max_retries:
                raise
            sleep(_RETRY_DELAY)


def _load_records():
    """Return the previously saved records, or an empty dict if none can be read."""
    try:
        with open(_RECORD_FILE, 'r', encoding='utf-8') as f:
            return json.load(f)
    except (OSError, ValueError):
        # Missing, unreadable or malformed file: start with no records.
        return {}


def _detail_field(tree, label):
    """Return the stripped text nodes of the detail-table row labelled *label*."""
    return [text.strip() for text in tree.xpath(
        "//table[@class='gg_detail']/tbody/tr/td[text()='" + label + "']/../td[2]/text()")]


def _load_detail(session, url):
    """Fetch one detail page and return its fields, or None if the page reports not-found."""
    sleep(1)
    page = getResponse(session, url).text
    if _NOT_FOUND_MARKER in page:
        return None
    tree = etree.HTML(page)
    title = tree.xpath("//div[@class='blkContainerSblk']/h1/text()")[0].strip()
    product = '\n'.join(_detail_field(tree, '影响产品')).strip("\n")
    description = '\n'.join(_detail_field(tree, '漏洞描述')).strip("\n")
    submission_time = _detail_field(tree, '报送时间')[0]
    risk = _RISK_PATTERN.findall(page)[0]
    return {'title': title, 'risk': risk, 'submissionTime': submission_time,
            'url': url, 'product': product, 'description': description}


def getResults(max_retries=10):
    """Collect yesterday's entries from the listing page, record and mail the new ones.

    The listing page is fetched, the rows whose sixth column equals
    yesterday's date are selected, and each linked detail page whose id is
    not yet in ``cnvdResults.json`` is fetched and parsed. If anything new
    was found, the record file is rewritten and the new entries are sent by
    e-mail as JSON text.

    :param max_retries: attempts allowed per page before giving up; ``None``
        retries without limit. A listing page that keeps failing re-raises
        the last error; a detail page that keeps failing is skipped.
    """
    # One session for all requests so the challenge cookies are reused.
    session = requests.session()
    listing = _retry(getResponse, max_retries, session, _LIST_URL)
    tree = etree.HTML(listing.text)
    # Links of the rows dated yesterday.
    urls = [_SITE_ROOT + href for href in tree.xpath(
        "//tbody/tr/td[6][text()='" + str(getYesterday()) + "']/../td[1]/a/@href")]

    records = _load_records()  # everything already recorded on disk
    results = {}  # entries that are new in this run, to be mailed
    for url in urls:
        cnvd_id = url.split('/')[-1]
        if cnvd_id in records:
            continue
        try:
            detail = _retry(_load_detail, max_retries, session, url)
        except Exception:
            # Retries exhausted: skip this entry rather than lose the others.
            continue
        if detail is None:
            continue
        records[cnvd_id] = detail
        results[cnvd_id] = detail

    if results:
        # Persist the full record set, then mail only the new entries.
        with open(_RECORD_FILE, 'w', encoding='utf-8') as f:
            f.write(json.dumps(records, sort_keys=False, indent=4, separators=(',', ':'), ensure_ascii=False))
        sendmail("xxx@qq.com", json.dumps(results, sort_keys=False, indent=4, separators=(',', ':'), ensure_ascii=False))
# Run the collection only when executed as a script, not when imported.
if __name__ == '__main__':
    getResults()
评论已关闭