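"""Push sitemap URLs to Baidu Xiongzhang's realtime link-submission API.

Each run fetches the sitemap, takes the next batch of URLs on our domain
(resuming from the position recorded by the previous run), writes them to a
temporary file, and POSTs that file to Baidu.
"""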
import logging
import os
import re
import subprocess
from io import StringIO
from urllib import request

# Site and Baidu Xiongzhang API credentials -- replace the placeholders with
# your own values.
domain = 'www.example.com'
app_id = 'xxxxxxxxxxxxxxxxx'
token = 'xxxxxxxxxxxxxxxxxxxxxx'
site_map_url = 'https://www.example.com/sitemap.xml'

# Maximum number of URLs pushed per run.
day_submit_max_lines = 10

# Baidu realtime push endpoint.
day_submit_url = 'http://data.zz.baidu.com/urls?appid={app_id}&token={token}&type=realtime'.format(app_id=app_id, token=token)

# Working files: the current URL batch, the resume position in the sitemap,
# and the run log.
day_submit_urls_file = "/tmp/baidu_xiongzhang_day_submit_url.txt"
day_record_file = "/tmp/baidu_xiongzhang_day_record.txt"
log_file = "/tmp/baidu_xiongzhang_day.log"


def regexpMatchUrl(content):
    """Return True if the line contains an http(s) URL."""
    pattern = re.findall(
        r'(http|https):\/\/[\w\-_]+(\.[\w\-_]+)+([\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?',
        content, re.IGNORECASE)
    return bool(pattern)


def regexpMatchWebSite(content):
    """Return True if the line belongs to our domain (literal match)."""
    # re.escape() keeps the dots in the domain from acting as regex wildcards.
    return bool(re.findall(re.escape(domain), content, re.IGNORECASE))


def getUrl(content):
    """Extract the first .html URL from a line, or '' if none is found."""
    pattern = re.findall(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+\.html',
        content, re.IGNORECASE)
    return pattern[0] if pattern else ''


def writeRecordFile(record_file_path, content):
    """Persist the current sitemap position."""
    with open(record_file_path, 'w') as record_file:
        record_file.write(content)


def readRecordFile(record_file_path):
    """Read the saved sitemap position, defaulting to "0" if absent or empty."""
    content = "0"
    if os.path.exists(record_file_path):
        with open(record_file_path, 'r') as record_file:
            content = record_file.readline()
    if len(content) == 0:
        content = "0"
    return content


def countWebsiteMapUrl():
    """Count the sitemap lines that contain a URL on our domain."""
    total = 0
    content = request.urlopen(site_map_url).read().decode('utf8')
    website_map_file = StringIO(content)
    for line in website_map_file:
        if regexpMatchUrl(line) and regexpMatchWebSite(line):
            total = total + 1
    website_map_file.close()
    return total


def createUrlFile(url_file_path, max_lines):
    """Write the next batch of up to max_lines sitemap URLs to url_file_path.

    Resumes from the position saved in day_record_file, so successive daily
    runs walk through the whole sitemap one batch at a time.
    """
    old_index = readRecordFile(day_record_file)
    content = request.urlopen(site_map_url).read().decode('utf8')
    website_map_file = StringIO(content)
    url_file = open(url_file_path, 'w')

    index = 0
    number = 0
    for line in website_map_file:
        if regexpMatchUrl(line) and regexpMatchWebSite(line):
            # Skip the URLs that earlier runs have already submitted.
            if index < int(old_index):
                index = index + 1
                continue
            url = getUrl(line)
            if url != '':
                index = index + 1
                number = number + 1
                url_file.write(url + "\n")
                if number >= max_lines:
                    break

    # Wrap around to the start of the sitemap once every URL has been pushed.
    if index == countWebsiteMapUrl():
        writeRecordFile(day_record_file, str(0))
    else:
        writeRecordFile(day_record_file, str(index))

    url_file.close()
    website_map_file.close()
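
# Baidu's link-submission endpoint accepts a text/plain POST body with one
# URL per line and answers with a short JSON status, which is logged below.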
def submitUrlFile(url, url_file_path):
    """POST the URL batch file to Baidu with curl and log the response."""
    shell_cmd_line = ("curl -H 'Content-Type:text/plain' --data-binary @"
                      + url_file_path + " " + '"' + url + '"')
    (status, output) = subprocess.getstatusoutput(shell_cmd_line)
    logging.info(output + "\n")


if __name__ == "__main__":
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        filename=log_file)
    createUrlFile(day_submit_urls_file, day_submit_max_lines)
    submitUrlFile(day_submit_url, day_submit_urls_file)
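
# Example deployment (an illustration, not part of the script itself): run
# once a day from cron so each run submits the next batch of sitemap URLs.
# The interpreter path and schedule below are assumptions; adjust as needed.
#
#   0 3 * * * /usr/bin/python3 /path/to/baidu_xiongzhang_day.py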