| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111 |
- import re
- import tqdm
- import jieba
# Matches any single character that is NOT a CJK ideograph (U+4E00-U+9FA5),
# a dot, an ASCII letter or an ASCII digit.  Only a leading "^" negates a
# character class; a "^" anywhere else in the class is a literal member, so
# the ranges are listed back to back without separators.
re_charset = re.compile("[^\u4e00-\u9fa5.a-zA-Z0-9]")

# Matches the start of a message header line:
#   group 1 - timestamp "YYYY-MM-DD HH:MM:SS"
#   group 2 - free text between the timestamp and the trailing bracketed token
#   group 3 - the trailing token including its brackets, "(...)" or "<...>"
re_message = re.compile(r'([0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}) (.*)(\(.+\)|<.+>)')
def read_qq_history_file(filename, header_lines=8):
    """Parse an exported chat-history text file into a list of messages.

    The first ``header_lines`` lines of the file are treated as a preamble:
    they are echoed to stdout and otherwise ignored.  After that, every
    non-blank line that matches ``re_message`` starts a new message; every
    other non-blank line is appended (with its newline) to the body of the
    message currently being built.

    Args:
        filename: Path of the UTF-8 encoded history file.
        header_lines: Number of preamble lines to skip (default 8).

    Returns:
        A list of dicts with the keys ``'time'``, ``'user'``, ``'id'`` (the
        three groups of ``re_message``) and ``'data'`` (the raw message body).
    """
    # First pass: count the lines so the progress bar has a meaningful total.
    # The file is opened in a ``with`` block so the handle is always closed.
    with open(filename, 'r', encoding='utf-8') as f:
        count = sum(1 for _ in f)

    message = []
    cur_msg = None

    # Second pass: read the data.
    with open(filename, 'r', encoding='utf-8') as f:
        for _ in range(header_lines):
            header = f.readline()
            print(header)

        # The preamble is consumed before the bar starts, so leave it out of
        # the total; clamp at zero for files shorter than the preamble.
        remaining = max(count - header_lines, 0)
        with tqdm.tqdm(total=remaining, ascii=True) as pbar:
            for line in f:
                # Advance for every line, blank or not, so the bar completes.
                pbar.update()

                stripped = line.strip()
                if not stripped:
                    continue

                msg_header = re_message.match(stripped)
                if msg_header:
                    # A new header closes the message collected so far.
                    if cur_msg is not None:
                        message.append(cur_msg)
                    cur_msg = {
                        'time': msg_header.group(1),
                        'user': msg_header.group(2),
                        'id': msg_header.group(3),
                        'data': ''
                    }
                elif cur_msg is not None:
                    cur_msg['data'] += line
                # else: body text before the first header has no message to
                # belong to, so it is skipped instead of raising TypeError.

    # The final message has no following header to flush it; do it here.
    if cur_msg is not None:
        message.append(cur_msg)

    return message
def filter_msg(messages):
    """Remove noise from the ``'data'`` field of every message, in place.

    Each message body is first stripped of newlines and of the picture and
    emoticon placeholders, then of URLs, @-mentions and a series of system
    notices (member joined, muted / unmuted by an admin, message recalled,
    gift, red-packet, action and flash-photo notices).  The substitutions are
    applied in the order listed below.

    Args:
        messages: List of message dicts, each with a string ``'data'`` entry.
            The dicts are modified in place; nothing is returned.
    """
    # Fragments removed verbatim (plain substring replacement).
    literal_noise = (
        "\n",
        r'[图片]',
        r'[表情]',
    )
    # Regular expressions whose matches are removed.
    regex_noise = (
        r'(http|https|ftp)://[0-9a-zA-Z~./_\-]+',
        r'@.+ ',
        r'.+加入本群',
        r'.+被管理员禁言[0-9]{1,2}(分钟|天)',
        r'.+被管理员解除禁言',
        r'.+撤回了一条消息',
        r'\[礼物\] .+成为.+的守护者',
        r'\[送礼物\] 为.+',
        r'\[QQ红包\]我发了一个.*',
        r'\[动作消息\].+',
        r'\[闪照\].+',
    )

    with tqdm.tqdm(total=len(messages), ascii=True) as progress:
        for record in messages:
            text = record['data']
            for fragment in literal_noise:
                text = text.replace(fragment, '')
            for pattern in regex_noise:
                text = re.sub(pattern, '', text)
            # NOTE: character-set stripping via re_charset is not applied here.
            record['data'] = text
            progress.update()
def generate_dataset(messages, output_path_source, output_path_target,
                     target_id='(794424922)', skip_prefix='安安子', max_len=64):
    """Write (previous message, reply) pairs to a source and a target file.

    Messages are walked in order, skipping those whose body is blank.  For
    each remaining message, it and the previous non-blank message form a pair
    that is written out only if all of the following hold:

    * the two bodies differ after stripping whitespace,
    * both bodies are shorter than ``max_len`` characters,
    * the two messages have different ``'user'`` values,
    * neither body starts with ``skip_prefix``,
    * the reply's ``'id'`` equals ``target_id``.

    Both bodies are segmented with ``jieba.cut`` and joined with spaces.  The
    source line is ``"<previous id> : <previous tokens>"`` and the target line
    is ``"<reply tokens>"``.

    Args:
        messages: List of message dicts with ``'data'``, ``'user'`` and
            ``'id'`` entries.
        output_path_source: Path of the source-side file (overwritten).
        output_path_target: Path of the target-side file (overwritten).
        target_id: Required ``'id'`` of the reply; ``None`` accepts any id.
        skip_prefix: Bodies starting with this text are excluded; ``None`` or
            ``''`` disables the check.
        max_len: Exclusive upper bound on the length of either body.
    """
    prev_msg = None
    with open(output_path_source, 'w', encoding='utf-8') as fs, \
            open(output_path_target, 'w', encoding='utf-8') as ft, \
            tqdm.tqdm(total=len(messages), ascii=True) as pbar:
        for each_msg in messages:
            # Advance first so skipped (blank) messages are counted too and
            # the bar reaches its total.
            pbar.update()

            cur_text = each_msg['data']
            if cur_text.strip() == '':
                continue

            if prev_msg is not None:
                prev_text = prev_msg['data']
                has_skip_prefix = bool(skip_prefix) and (
                    cur_text.startswith(skip_prefix)
                    or prev_text.startswith(skip_prefix)
                )
                # filter conditions
                keep = (
                    prev_text.strip() != cur_text.strip()
                    and len(cur_text) < max_len
                    and len(prev_text) < max_len
                    and prev_msg['user'] != each_msg['user']
                    and not has_skip_prefix
                    and (target_id is None or each_msg['id'] == target_id)
                )
                if keep:
                    prev_tokens = " ".join(jieba.cut(prev_text, cut_all=False))
                    cur_tokens = " ".join(jieba.cut(cur_text, cut_all=False))
                    fs.write(prev_msg['id'] + ' : ' + prev_tokens + '\n')
                    ft.write(cur_tokens + '\n')

            # Every non-blank message becomes the context for the next one,
            # whether or not a pair was written.
            prev_msg = each_msg
if __name__ == "__main__":
    # One entry per dataset: (chat history file, source output, target output).
    # The validation set is built first, then the training set.
    jobs = (
        ('data/Octoon 开发组.txt',
         'data/valid_source.txt', 'data/valid_target.txt'),
        ('data/ISOIEC C++ China Unofficial.txt',
         'data/train_source.txt', 'data/train_target.txt'),
    )
    for history_path, source_path, target_path in jobs:
        print('read message')
        msg = read_qq_history_file(history_path)
        print('filter message')
        filter_msg(msg)
        print('write to file')
        generate_dataset(msg, source_path, target_path)
|