| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687 |
- import re
- import tqdm
# Matches a message header line: a ``YYYY-MM-DD HH:MM:SS`` timestamp
# (group 1), one space, then the rest of the line (group 2).
re_message = re.compile(
    r'([0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2})'  # timestamp
    r' (.*)'  # remainder of the header line
)
def read_qq_history_file(filename, header_lines=8):
    """Parse a chat-history text file into a list of message dicts.

    The first ``header_lines`` lines are treated as a preamble: they are
    echoed to stdout and not parsed. Every following non-blank line is
    matched (after stripping) against the module-level ``re_message``
    pattern. A matching line starts a new message; any other line is
    appended verbatim (newline included) to the message being built.

    Args:
        filename: Path of the UTF-8 encoded history file.
        header_lines: Number of leading lines to skip as preamble.
            Defaults to 8, the value previously hard-coded here.

    Returns:
        A list of dicts, in file order, each with the keys ``'time'``
        (group 1 of the header match), ``'user'`` (group 2, the rest of
        the header line) and ``'data'`` (the accumulated body text).
    """
    messages = []

    # First pass only counts lines so the progress bar has a total.
    # The context manager makes sure this handle is closed again.
    with open(filename, 'r', encoding='utf-8') as f:
        line_count = sum(1 for _ in f)

    with open(filename, 'r', encoding='utf-8') as f:
        # NOTE(review): the preamble is echoed line by line; confirm that
        # printing every header line (not just the last) is intended.
        for _ in range(header_lines):
            print(f.readline())

        cur_msg = None
        # The preamble is not part of the loop below, so leave it out of
        # the total; otherwise the bar could never reach 100%.
        body_total = max(line_count - header_lines, 0)
        with tqdm.tqdm(total=body_total, ascii=True) as pbar:
            for line in f:
                # Advance for every line read, blank ones included.
                pbar.update()
                stripped = line.strip()
                if not stripped:
                    continue
                msg_header = re_message.match(stripped)
                if msg_header:
                    if cur_msg is not None:
                        messages.append(cur_msg)
                    cur_msg = {
                        'time': msg_header.group(1),
                        'user': msg_header.group(2),
                        'data': '',
                    }
                elif cur_msg is not None:
                    cur_msg['data'] += line
                # else: text before the first header belongs to no
                # message; skip it instead of failing on ``None``.

        # The final message has no following header to flush it.
        if cur_msg is not None:
            messages.append(cur_msg)

    return messages
def filter_msg(messages):
    """Clean the ``'data'`` text of every message in place.

    For each message this removes, in order: ``@mention`` tokens, line
    breaks, the literal placeholders ``[图片]`` and ``[表情]``, and
    http/https/ftp URLs.

    Args:
        messages: List of dicts with a string ``'data'`` entry; each
            dict is modified in place.

    Returns:
        None. The cleaned text is written back to ``each_msg['data']``.
    """
    url_pattern = re.compile(r'(http|https|ftp)://[0-9a-zA-Z~./_\-]+')
    # A mention is '@' plus non-space characters, ended by one space or
    # the end of a line. The previous greedy '@.+ ' ran to the LAST space
    # in the text and so deleted most of the message with the mention.
    mention_pattern = re.compile(r'@\S+(?: |$)', re.MULTILINE)

    with tqdm.tqdm(total=len(messages), ascii=True) as pbar:
        for each_msg in messages:
            text = each_msg['data']
            # Strip mentions while line breaks still mark where a
            # trailing mention ends; joining lines first would glue the
            # next line's first word onto the mention.
            text = mention_pattern.sub('', text)
            text = text.replace("\n", '')
            text = text.replace('[图片]', '')
            text = text.replace('[表情]', '')
            text = url_pattern.sub('', text)
            each_msg['data'] = text
            pbar.update()
def generate_dataset(messages, output_path_source, output_path_target):
    """Write consecutive message pairs to two line-aligned text files.

    Messages whose ``'data'`` is empty or whitespace-only are skipped.
    For every remaining message that has a non-skipped predecessor, the
    predecessor's text goes to the source file and the message's own
    text to the target file, one line each. Line *k* of the source file
    therefore pairs with line *k* of the target file.

    Args:
        messages: List of dicts with a string ``'data'`` entry.
        output_path_source: Path of the source file; overwritten, UTF-8.
        output_path_target: Path of the target file; overwritten, UTF-8.

    Returns:
        None.
    """
    prev_msg = None
    with open(output_path_source, 'w', encoding='utf-8') as fs, \
            open(output_path_target, 'w', encoding='utf-8') as ft, \
            tqdm.tqdm(total=len(messages), ascii=True) as pbar:
        for each_msg in messages:
            # Advance before the skip check so skipped messages still
            # count and the bar can reach its total.
            pbar.update()
            if each_msg['data'].strip() == '':
                continue
            if prev_msg is not None:
                fs.write(prev_msg['data'] + '\n')
                ft.write(each_msg['data'] + '\n')
            prev_msg = each_msg
if __name__ == "__main__":
    # Script entry point: parse one history file, clean the messages,
    # then write the paired source/target training files.
    #
    # Alternative data set, kept for reference (disabled):
    #   history: 'data/Octoon 开发组.txt'
    #   outputs: 'data/octoon_source.txt', 'data/octoon_target.txt'
    history_path = 'data/ISOIEC C++ China Unofficial.txt'
    source_path = 'data/train_source.txt'
    target_path = 'data/train_target.txt'

    print('read message')
    msg = read_qq_history_file(history_path)

    print('filter message')
    filter_msg(msg)

    print('write to file')
    generate_dataset(msg, source_path, target_path)
|