# preprocess.py — parse exported QQ chat-history text files into a
# word-segmented parallel source/target corpus for chatbot training.
  1. import re
  2. import tqdm
  3. import jieba
  4. re_charset = re.compile("[^\u4e00-\u9fa5^.^a-z^A-Z^0-9]")
  5. re_message = re.compile(r'([0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}) (.*)(\(.+\)|<.+>)')
  6. def read_qq_history_file(filename):
  7. message = []
  8. # get line count
  9. count = -1
  10. for count, line in enumerate(open(filename, 'r', encoding='utf-8')):
  11. pass
  12. count += 1
  13. # read data
  14. with open(filename, 'r', encoding='utf-8') as f:
  15. for i in range(8):
  16. header = f.readline()
  17. print(header)
  18. cur_msg = None
  19. line = f.readline()
  20. with tqdm.tqdm(total=count, ascii=True) as pbar:
  21. while line:
  22. if line.strip() == '':
  23. line = f.readline()
  24. continue
  25. msg_header = re_message.match(line.strip())
  26. if msg_header:
  27. if cur_msg is not None:
  28. message.append(cur_msg)
  29. cur_msg = {
  30. 'time': msg_header.group(1),
  31. 'user': msg_header.group(2),
  32. 'id': msg_header.group(3),
  33. 'data': ''
  34. }
  35. else:
  36. cur_msg['data'] += line
  37. line = f.readline()
  38. pbar.update()
  39. return message
  40. def filter_msg(messages):
  41. with tqdm.tqdm(total=len(messages), ascii=True) as pbar:
  42. for each_msg in messages:
  43. each_msg['data'] = each_msg['data'].replace("\n", '')
  44. each_msg['data'] = each_msg['data'].replace(r'[图片]', '')
  45. each_msg['data'] = each_msg['data'].replace(r'[表情]', '')
  46. each_msg['data'] = re.sub(r'(http|https|ftp)://[0-9a-zA-Z~./_\-]+', '', each_msg['data'])
  47. each_msg['data'] = re.sub(r'@.+ ', '', each_msg['data'])
  48. each_msg['data'] = re.sub(r'.+加入本群', '', each_msg['data'])
  49. each_msg['data'] = re.sub(r'.+被管理员禁言[0-9]{1,2}(分钟|天)', '', each_msg['data'])
  50. each_msg['data'] = re.sub(r'.+被管理员解除禁言', '', each_msg['data'])
  51. each_msg['data'] = re.sub(r'.+撤回了一条消息', '', each_msg['data'])
  52. each_msg['data'] = re.sub(r'\[礼物\] .+成为.+的守护者', '', each_msg['data'])
  53. each_msg['data'] = re.sub(r'\[送礼物\] 为.+', '', each_msg['data'])
  54. each_msg['data'] = re.sub(r'\[QQ红包\]我发了一个.*', '', each_msg['data'])
  55. each_msg['data'] = re.sub(r'\[动作消息\].+', '', each_msg['data'])
  56. each_msg['data'] = re.sub(r'\[闪照\].+', '', each_msg['data'])
  57. # each_msg['data'] = re_charset.sub("", each_msg['data'])
  58. pbar.update()
  59. def generate_dataset(messages, output_path_source, output_path_target):
  60. prev_msg = None
  61. with open(output_path_source, 'w', encoding='utf-8') as fs:
  62. with open(output_path_target, 'w', encoding='utf-8') as ft:
  63. with tqdm.tqdm(total=len(messages), ascii=True) as pbar:
  64. for each_msg in messages:
  65. if each_msg['data'].strip() == '':
  66. continue
  67. if prev_msg is not None:
  68. # filter conditions
  69. if prev_msg['data'].strip() != each_msg['data'].strip() \
  70. and len(each_msg['data']) < 64 and len(prev_msg['data']) < 64 \
  71. and prev_msg['user'] != each_msg['user'] \
  72. and not each_msg['data'].startswith('安安子') and not prev_msg['data'].startswith('安安子') \
  73. and each_msg['id'] == '(794424922)':
  74. prev_seg_list = jieba.cut(prev_msg['data'], cut_all=False)
  75. cur_seg_list = jieba.cut(each_msg['data'], cut_all=False)
  76. prev_seg_list = " ".join(prev_seg_list)
  77. cur_seg_list = " ".join(cur_seg_list)
  78. fs.write(prev_msg['id'] + ' : ' + prev_seg_list + '\n')
  79. ft.write(cur_seg_list + '\n')
  80. prev_msg = each_msg
  81. pbar.update()
  82. if __name__ == "__main__":
  83. print('read message')
  84. msg = read_qq_history_file('data/Octoon 开发组.txt')
  85. print('filter message')
  86. filter_msg(msg)
  87. print('write to file')
  88. generate_dataset(msg, 'data/valid_source.txt', 'data/valid_target.txt')
  89. print('read message')
  90. msg = read_qq_history_file('data/ISOIEC C++ China Unofficial.txt')
  91. print('filter message')
  92. filter_msg(msg)
  93. print('write to file')
  94. generate_dataset(msg, 'data/train_source.txt', 'data/train_target.txt')