|
|
# -*- coding: utf-8 -*-
- from hashlib import new
- import json
- import os
- import sys
- import time
- from multiprocessing import Process
- import time
- from datetime import datetime, timedelta
- import os
- import pickle
- import calendar
- import re
- from copy import deepcopy
- import requests
- import csv
- from commandline_config import Config
- from service_invoke import invokeService
-
-
class TimeUtil(object):
    """Helpers for parsing "+N"/"-N" timezone strings and shifting datetimes."""

    @classmethod
    def parse_timezone(cls, timezone):
        """
        Parse a timezone offset expression.

        :param timezone: str, e.g. "+8" or "-5"
        :return: dict with keys 'symbol' ('+' or '-') and 'offset' (int, hours)
        :raises ValueError: if the string does not start with +/- and digits
        """
        result = re.match(r'(?P<symbol>[+-])(?P<offset>\d+)', timezone)
        if result is None:
            # Fail with a clear error instead of the original AttributeError
            # ('NoneType' has no attribute 'groupdict') on malformed input.
            raise ValueError("cannot parse timezone format: %r" % (timezone,))
        parts = result.groupdict()
        return {
            'symbol': parts['symbol'],
            'offset': int(parts['offset']),
        }

    @classmethod
    def convert_timezone(cls, dt, timezone="+0"):
        """
        Shift a naive datetime (assumed UTC) by the given offset.

        :param dt: datetime to shift
        :param timezone: offset string such as "+8"; "+0" means no shift
        :return: shifted datetime
        """
        result = cls.parse_timezone(timezone)
        symbol = result['symbol']
        offset = result['offset']

        if symbol == '+':
            return dt + timedelta(hours=offset)
        elif symbol == '-':
            return dt - timedelta(hours=offset)
        else:
            # Unreachable: the regex above only admits '+' or '-'.
            raise Exception('dont parse timezone format')
-
-
def generate_timestamp():
    """Return the current UTC time shifted to UTC+8, as a string.

    Side effect: prints the converted time.

    :return: str, e.g. "2022-10-17 23:35:40.881448"
    """
    # NOTE(review): the original also computed an epoch timestamp via
    # time.gmtime()/calendar.timegm() into an unused local; that dead
    # code is removed here.
    convert_now = TimeUtil.convert_timezone(datetime.utcnow(), '+8')
    print("current_time: " + str(convert_now))
    return str(convert_now)
-
-
def main():
    """End-to-end scraping pipeline.

    1. Read already-collected author names from raw_data.csv (column 4).
    2. Read candidate keywords from author_list.csv, skipping collected ones.
    3. Build Toutiao user-search URLs and submit them to the scraping service.
    4. Run the local execute stage, parse the resulting task CSV,
       de-duplicate rows, append them to raw_data.csv, and export
       everything to all_data.xls.
    """
    config = {
        "pages": 5,
        "test": False,
        "test_pages": 3,
    }
    c = Config(config)
    print(c)

    # Authors already present in raw_data.csv (column index 4).
    with open("./raw_data.csv", encoding='utf-8') as f:
        author_list = [line[4] for line in csv.reader(f)]

    # Keywords not yet collected; in test mode stop after test_pages * 100 rows.
    keywords = []
    with open("./author_list.csv", encoding='utf-8') as f:
        for i, line in enumerate(csv.reader(f), start=1):
            if line[0] not in author_list:
                keywords.append(line[0])
            else:
                # Fix: the original passed line[0] as a second print()
                # argument instead of interpolating it into the format string.
                print("Will not append keyword %s" % line[0])
            if c.test and i > c.test_pages * 100:
                break

    # Build one search URL per keyword; join once instead of the original
    # quadratic string concatenation. In test mode stop after test_pages URLs.
    url_parts = []
    for i, keyword in enumerate(keywords, start=1):
        url_parts.append(
            "https://so.toutiao.com/search?dvpf=pc&source=input&keyword=%s&pd=user&action_type=search_subtab_switch&page_num=0&from=media&cur_tab_title=media\r\n" % keyword)
        if c.test and i > c.test_pages:
            break
    urlList = "".join(url_parts)
    print(urlList)

    # Submit the URL batch to the scraping service; returns the task id.
    descTaskID = invokeService(
        1, {"urlList_0": urlList})
    print("descTaskID: " + str(descTaskID))

    # Timestamp made filename-safe (no spaces or colons).
    filename = generate_timestamp().replace(" ", "").replace(":", "-")
    print("filename:", filename)

    # Run the local execute stage; its exit status was unused in the original.
    command = ('python ServiceWrapper_ExecuteStage_local.py '
               + str(descTaskID) + ' ' + filename)
    os.system(command)

    file_name = "task_" + str(descTaskID) + "_" + filename + ".csv"
    print("file_name:", file_name)
    with open("./Data/" + file_name, encoding='utf-8') as f:
        # Drop the header row of the task output.
        new_descTaskID = list(csv.reader(f))[1:]

    # Keep one row per author id (column 2), skipping consecutive duplicates,
    # and split the stats cell (column 1) into likes / fans / follows.
    after_remove_duplicate = []
    for i in range(len(new_descTaskID)):
        try:
            if i > 0 and new_descTaskID[i][2] == new_descTaskID[i - 1][2]:
                continue
            if new_descTaskID[i][2] != "":
                stats = new_descTaskID[i][1]
                zan = stats.split("获赞")[0]
                fans = stats.split("粉丝")[0].split("获赞")[1]
                follow = stats.split("关注")[0].split("粉丝")[1]
                after_remove_duplicate.append(
                    [new_descTaskID[i][0], zan, fans, follow,
                     new_descTaskID[i][2], new_descTaskID[i][3]])
        except IndexError:
            # Malformed row (missing columns or unexpected stats text):
            # skip it, preserving the original best-effort behavior but
            # without the bare except that hid every other error.
            pass

    print("after_remove_duplicate", after_remove_duplicate)

    all_collected = [author[4] for author in after_remove_duplicate]
    print("all_collected:", all_collected)

    # Keywords that yielded no result still get an empty placeholder row
    # so they are recorded (and skipped on the next run).
    for keyword in keywords:
        if keyword not in all_collected:
            print("keyword not collected:", keyword)
            after_remove_duplicate.append(['', '', '', '', keyword, ''])

    new_descTaskID = after_remove_duplicate
    print("new_descTaskID:", new_descTaskID)

    # Append the new rows to the cumulative CSV.
    with open("raw_data.csv", "a", encoding='utf-8', newline='') as csvfile:
        csv.writer(csvfile).writerows(new_descTaskID)

    # xlwt is a third-party dependency used only for the final Excel export,
    # hence the local import (kept from the original).
    import xlwt

    with open("./raw_data.csv", encoding='utf-8') as f:
        all_data = list(csv.reader(f))

    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet("Sheet")
    for row_idx, row in enumerate(all_data):
        for col_idx, value in enumerate(row):
            sheet.write(row_idx, col_idx, value)
    workbook.save("all_data.xls")
-
-
# Run the pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|