# JasonWang / QQChat
import re
import tqdm


# Matches a message header line: a "YYYY-MM-DD HH:MM:SS" timestamp (group 1),
# a single space, then the rest of the line (group 2), which
# read_qq_history_file stores as the message's 'user'.
re_message = re.compile(r'([0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}) (.*)')

def read_qq_history_file(filename):
    """Parse an exported QQ chat history text file into a list of messages.

    The first 8 lines of the file are treated as the export header: each is
    printed and otherwise ignored. After that, blank lines are skipped, every
    line matching ``re_message`` starts a new message, and every other line is
    appended (unstripped, so including its newline) to the ``'data'`` of the
    message currently being built.

    Args:
        filename: Path of the UTF-8 encoded history file.

    Returns:
        A list of dicts in file order, each with the keys ``'time'`` and
        ``'user'`` (the two groups captured by ``re_message``) and ``'data'``
        (the concatenated body lines, possibly empty).
    """
    header_line_count = 8

    # First pass: count the lines so the progress bar has a total. The file is
    # opened in a ``with`` block so the handle is closed deterministically.
    with open(filename, 'r', encoding='utf-8') as f:
        count = sum(1 for _ in f)

    message = []
    with open(filename, 'r', encoding='utf-8') as f:
        for _ in range(header_line_count):
            header = f.readline()
            print(header)

        cur_msg = None
        # Only the lines after the header are iterated below, so the total is
        # reduced accordingly and the bar can actually reach 100%.
        remaining = max(count - header_line_count, 0)
        with tqdm.tqdm(total=remaining, ascii=True) as pbar:
            for line in f:
                pbar.update()
                stripped = line.strip()
                if stripped == '':
                    continue
                msg_header = re_message.match(stripped)
                if msg_header:
                    if cur_msg is not None:
                        message.append(cur_msg)
                    cur_msg = {
                        'time': msg_header.group(1),
                        'user': msg_header.group(2),
                        'data': ''
                    }
                elif cur_msg is not None:
                    cur_msg['data'] += line
                # else: body text before any message header has no message to
                # belong to; it is skipped instead of crashing on ``None``.

        # The loop only stores a message when the *next* header is seen, so
        # the final message has to be flushed explicitly.
        if cur_msg is not None:
            message.append(cur_msg)

    return message


def filter_msg(messages):
    """Scrub the ``'data'`` text of every message in place.

    For each message the following are removed, in this order: newline
    characters, the ``[图片]`` and ``[表情]`` placeholders, http/https/ftp URLs,
    and ``@name `` mentions (an ``@`` followed by non-space characters and one
    space).

    Args:
        messages: List of dicts with a string ``'data'`` entry; each dict is
            modified in place.

    Returns:
        None.
    """
    # Compile once, outside the per-message loop.
    url_pattern = re.compile(r'(http|https|ftp)://[0-9a-zA-Z~./_\-]+')
    # ``\S+`` stops at the first space. A greedy ``.+`` would run to the LAST
    # space in the message and delete ordinary text that follows the mention.
    mention_pattern = re.compile(r'@\S+ ')

    with tqdm.tqdm(total=len(messages), ascii=True) as pbar:
        for each_msg in messages:
            text = each_msg['data']
            text = text.replace("\n", '')
            text = text.replace(r'[图片]', '')
            text = text.replace(r'[表情]', '')
            text = url_pattern.sub('', text)
            text = mention_pattern.sub('', text)
            each_msg['data'] = text
            pbar.update()


def generate_dataset(messages, output_path_source, output_path_target):
    """Write consecutive non-empty messages as aligned source/target lines.

    Messages whose ``'data'`` is empty or whitespace-only are skipped. For
    every remaining message after the first, the previous remaining message's
    text is written as one line of the source file and the current message's
    text as the matching line of the target file.

    Args:
        messages: List of dicts with a string ``'data'`` entry.
        output_path_source: Path of the UTF-8 file receiving the "previous"
            texts.
        output_path_target: Path of the UTF-8 file receiving the "current"
            texts.
    """
    previous_text = None
    with open(output_path_source, 'w', encoding='utf-8') as source_file, \
            open(output_path_target, 'w', encoding='utf-8') as target_file, \
            tqdm.tqdm(total=len(messages), ascii=True) as progress:
        for current in messages:
            current_text = current['data']
            if current_text.strip() == '':
                # Skipped messages neither pair up nor advance the bar.
                continue
            if previous_text is not None:
                source_file.write(previous_text + '\n')
                target_file.write(current_text + '\n')
            previous_text = current_text
            progress.update()

if __name__ == "__main__":
    # The string literal below is a disabled earlier run against a different
    # chat export; it is a bare expression and is never executed.
    '''
    print('read message')
    msg = read_qq_history_file('data/Octoon 开发组.txt')
    print('filter message')
    filter_msg(msg)
    print('write to file')
    generate_dataset(msg, 'data/octoon_source.txt', 'data/octoon_target.txt')
    '''

    # Input chat export and the two aligned output files.
    chat_log_path = 'data/ISOIEC C++ China Unofficial.txt'
    source_path, target_path = 'data/train_source.txt', 'data/train_target.txt'

    # Pipeline: parse the export, scrub each message, write the paired files.
    print('read message')
    msg = read_qq_history_file(chat_log_path)

    print('filter message')
    filter_msg(msg)

    print('write to file')
    generate_dataset(msg, source_path, target_path)