import re

import jieba
import tqdm

# Strips characters outside the "allowed" set.
# NOTE(review): the inner '^' characters are literal carets inside the class,
# not negation — the pattern also permits '^' itself, which is probably
# unintended. Kept byte-identical because its only call site (in filter_msg)
# is commented out; confirm the intent before re-enabling.
re_charset = re.compile("[^\u4e00-\u9fa5^.^a-z^A-Z^0-9]")

# Message header line of a QQ export:
#   "YYYY-MM-DD HH:MM:SS <nickname> (<qq-number>)"  or  "... <email>"
# group(1)=timestamp, group(2)=nickname, group(3)="(id)" or "<id>".
re_message = re.compile(r'([0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}) (.*)(\(.+\)|<.+>)')

# Pre-compiled patterns for system notices / noise stripped by filter_msg().
# Order matters and mirrors the original substitution sequence.
_NOISE_PATTERNS = [
    re.compile(r'(http|https|ftp)://[0-9a-zA-Z~./_\-]+'),
    re.compile(r'@.+ '),
    re.compile(r'.+加入本群'),
    re.compile(r'.+被管理员禁言[0-9]{1,2}(分钟|天)'),
    re.compile(r'.+被管理员解除禁言'),
    re.compile(r'.+撤回了一条消息'),
    re.compile(r'\[礼物\] .+成为.+的守护者'),
    re.compile(r'\[送礼物\] 为.+'),
    re.compile(r'\[QQ红包\]我发了一个.*'),
    re.compile(r'\[动作消息\].+'),
    re.compile(r'\[闪照\].+'),
]


def read_qq_history_file(filename):
    """Parse an exported QQ chat-history text file into a message list.

    Args:
        filename: path to the UTF-8 exported history file.

    Returns:
        A list of dicts, each with keys 'time', 'user', 'id' and 'data'
        (the possibly multi-line message body, newlines included).
    """
    messages = []

    # Count lines once so the progress bar has a total.
    # BUG FIX: the original iterated enumerate(open(...)) without closing
    # the handle, leaking it; 'with' guarantees closure.
    with open(filename, 'r', encoding='utf-8') as f:
        count = sum(1 for _ in f)

    with open(filename, 'r', encoding='utf-8') as f:
        # The export format begins with an 8-line header; echo it as before.
        for _ in range(8):
            header = f.readline()
            print(header)

        cur_msg = None
        line = f.readline()
        with tqdm.tqdm(total=count, ascii=True) as pbar:
            while line:
                if line.strip() == '':
                    line = f.readline()
                    continue
                msg_header = re_message.match(line.strip())
                if msg_header:
                    # A new header closes the previous message.
                    if cur_msg is not None:
                        messages.append(cur_msg)
                    cur_msg = {
                        'time': msg_header.group(1),
                        'user': msg_header.group(2),
                        'id': msg_header.group(3),
                        'data': '',
                    }
                elif cur_msg is not None:
                    # Continuation line of the current message body.
                    # BUG FIX: guard against stray text before the first
                    # header, which previously raised TypeError on None.
                    cur_msg['data'] += line
                line = f.readline()
                pbar.update()

        # BUG FIX: the original silently dropped the file's final message.
        if cur_msg is not None:
            messages.append(cur_msg)

    return messages


def filter_msg(messages):
    """Clean message bodies in place.

    Removes newlines, media placeholders ([图片]/[表情]), URLs, @-mentions
    and QQ system notices (joins, mutes, recalls, gifts, red packets, ...).
    Mutates each dict's 'data' field; returns None.
    """
    with tqdm.tqdm(total=len(messages), ascii=True) as pbar:
        for each_msg in messages:
            data = each_msg['data'].replace('\n', '')
            data = data.replace('[图片]', '')
            data = data.replace('[表情]', '')
            # Patterns are compiled once at module load instead of being
            # recompiled for every message (original used re.sub per call).
            for pattern in _NOISE_PATTERNS:
                data = pattern.sub('', data)
            # each_msg['data'] = re_charset.sub("", data)
            each_msg['data'] = data
            pbar.update()
def generate_dataset(messages, output_path_source, output_path_target,
                     target_id='(794424922)', bot_prefix='安安子'):
    """Write consecutive message pairs as source/target lines for training.

    A pair (prev, cur) is emitted when: the texts differ, both are shorter
    than 64 characters, they come from different users, neither starts with
    *bot_prefix*, and cur was sent by *target_id*. Both texts are segmented
    with jieba before writing.

    Args:
        messages: list of message dicts from read_qq_history_file().
        output_path_source: output path for segmented prompt lines
            (prefixed with the sender id).
        output_path_target: output path for segmented reply lines.
        target_id: only replies from this sender id become targets
            (default preserves the original hard-coded value).
        bot_prefix: pairs where either text starts with this prefix are
            skipped (default preserves the original hard-coded value).
    """
    prev_msg = None
    # One flat 'with' replaces the original triple nesting — identical semantics.
    with open(output_path_source, 'w', encoding='utf-8') as fs, \
            open(output_path_target, 'w', encoding='utf-8') as ft, \
            tqdm.tqdm(total=len(messages), ascii=True) as pbar:
        for each_msg in messages:
            if each_msg['data'].strip() == '':
                # BUG FIX: the original skipped pbar.update() here, so the
                # progress bar under-counted blank messages.
                pbar.update()
                continue
            if prev_msg is not None:
                # Pair-selection filters (see docstring).
                if prev_msg['data'].strip() != each_msg['data'].strip() \
                        and len(each_msg['data']) < 64 and len(prev_msg['data']) < 64 \
                        and prev_msg['user'] != each_msg['user'] \
                        and not each_msg['data'].startswith(bot_prefix) \
                        and not prev_msg['data'].startswith(bot_prefix) \
                        and each_msg['id'] == target_id:
                    prev_seg = " ".join(jieba.cut(prev_msg['data'], cut_all=False))
                    cur_seg = " ".join(jieba.cut(each_msg['data'], cut_all=False))
                    fs.write(prev_msg['id'] + ' : ' + prev_seg + '\n')
                    ft.write(cur_seg + '\n')
            prev_msg = each_msg
            pbar.update()


if __name__ == "__main__":
    # (history file, source output, target output) — validation set first,
    # then training set, exactly as the original sequential script did.
    datasets = [
        ('data/Octoon 开发组.txt',
         'data/valid_source.txt', 'data/valid_target.txt'),
        ('data/ISOIEC C++ China Unofficial.txt',
         'data/train_source.txt', 'data/train_target.txt'),
    ]
    for history_path, source_path, target_path in datasets:
        print('read message')
        msg = read_qq_history_file(history_path)
        print('filter message')
        filter_msg(msg)
        print('write to file')
        generate_dataset(msg, source_path, target_path)