注:本脚本不能保证一次运行就全部查询成功;查询失败的网站会保存在 Query failure.csv 文件中,查询成功的网站会保存在 webweight.csv 文件中。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92
import csv
import threading
import time
from queue import Empty, Queue
from urllib.parse import urlparse

import requests
from lxml import etree
# Browser-like User-Agent header sent with every request.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36 SE 2.X MetaSr 1.0',
}


class WebWeight(threading.Thread):
    """Worker thread that looks up the weight of queued sites on aizhan.com.

    Each worker pulls site names from the shared queue until it is empty.
    For every site it requests ``AIZHAN_URL + site``, reads the ``alt``
    attributes found under the ``<a id="baidurank_br">`` element and appends
    one ``site, weight`` row per value to ``RESULT_FILE``.

    A site whose request raises, or whose page contains no such attribute,
    is appended to ``FAILURE_FILE`` (one site per row) so it can be retried
    later instead of being silently dropped.

    Args:
        queue: Queue of site names (strings) shared by all workers.
    """

    AIZHAN_URL = 'https://www.aizhan.com/cha/'
    RESULT_FILE = "webweight.csv"
    FAILURE_FILE = "Query failure.csv"
    REQUEST_DELAY = 4     # seconds to pause before each request
    REQUEST_TIMEOUT = 30  # seconds before a stalled request is abandoned

    # Shared by every worker: they all append to the same CSV files, so
    # writes are serialized to keep rows from interleaving.
    _write_lock = threading.Lock()

    def __init__(self, queue):
        super().__init__()
        self.queue = queue

    def run(self):
        """Process sites from the queue until none are left."""
        while True:
            # get_nowait() instead of "if not empty(): get()": another worker
            # may take the last item between the check and a blocking get(),
            # which would leave this thread waiting forever.
            try:
                chaxunurl = self.queue.get_nowait()
            except Empty:
                break

            print("[+] 正在查询:" + chaxunurl)
            time.sleep(self.REQUEST_DELAY)

            try:
                tags = self._query_weights(chaxunurl)
            except Exception as exc:
                # Worker boundary: one failing site must not kill the thread
                # and strand the rest of the queue. Report it and move on.
                print("[!] 查询失败: " + chaxunurl + " (" + str(exc) + ")")
                self._save_failure(chaxunurl)
                continue

            if not tags:
                print("[!] 查询失败: " + chaxunurl + " (未获取到权重)")
                self._save_failure(chaxunurl)
                continue

            self._save_weights(chaxunurl, tags)

    def _query_weights(self, domain):
        """Fetch the lookup page for *domain* and return the matched ``alt`` values."""
        url = self.AIZHAN_URL + domain
        response = requests.get(url, headers=header, timeout=self.REQUEST_TIMEOUT)
        print("[-] 请求url:" + url)
        # Decode the raw bytes as UTF-8 directly. Round-tripping through
        # response.text / response.encoding fails when the server declares
        # no charset (encoding is None).
        html = response.content.decode('utf-8', errors='replace')
        tree = etree.HTML(html)
        if tree is None:  # nothing parseable in the response body
            return []
        return tree.xpath('//a[@id="baidurank_br"]//@alt')

    def _save_weights(self, domain, tags):
        """Append one ``domain, weight`` row per tag to the result file."""
        with self._write_lock, open(self.RESULT_FILE, "a+", encoding='utf-8', newline='') as file:
            csvwriter = csv.writer(file)
            for tag in tags:
                strtag = str(tag)
                # 'n' is recorded as weight 0 (presumably the page's marker
                # for "no rank" - confirm against the site).
                weight = '0' if strtag in ('n', '0') else strtag
                csvwriter.writerow([domain, weight])
                print("[+] 查询结果: " + domain + " 权重:" + weight)

    def _save_failure(self, domain):
        """Append *domain* to the failure file so it can be queried again."""
        with self._write_lock, open(self.FAILURE_FILE, "a+", encoding='utf-8', newline='') as file:
            csv.writer(file).writerow([domain])
def main():
    """Read sites from ``websites.txt`` and query their weight with worker threads.

    Ensures ``webweight.csv`` starts with a ``weburl,weight`` header, queues
    the host of every non-blank line in ``websites.txt`` (the network
    location when the line parses as a URL with one, otherwise the line's
    path component), runs up to five ``WebWeight`` workers over the queue
    and waits for them to finish.
    """
    # The file is opened in append mode so earlier results are kept; write
    # the header only when the file is new/empty, otherwise every run would
    # insert another header row in the middle of the data.
    with open("webweight.csv", "a", encoding='utf-8', newline='') as file:
        if file.tell() == 0:
            csv.writer(file).writerow(['weburl', 'weight'])

    queue = Queue()
    with open("websites.txt", "r") as file:
        for line in file:
            target = line.strip()
            if not target:
                # A blank line would otherwise be queued as an empty site.
                continue
            parsed = urlparse(target)
            queue.put(parsed.netloc or parsed.path)

    threads_count = 5
    # Never start more workers than there are sites to query.
    threads = [WebWeight(queue) for _ in range(min(threads_count, queue.qsize()))]

    for worker in threads:
        time.sleep(1)  # stagger the start of each worker
        worker.start()

    for worker in threads:
        worker.join()

    print("Results saved in webweight.csv")
# Run the lookup only when executed as a script, not when imported.
if __name__ == "__main__":
    main()