将目标文本按行分隔后，把各行文本分配到多个 Python 进程并行分词，然后归并结果，从而获得分词速度的可观提升。基于 Python 自带的 multiprocessing 模块，目前暂不支持 Windows。
ny6fqffe1#
You can try this. It works in Windows.
# Segment every *.txt file of a folder with jieba, spreading the lines over a
# pool of worker processes, and write the space-separated tokens of each input
# file to <output>/<name>.output.txt.
import argparse
import os
import time
from multiprocessing import Pool

import jieba
from path import Path

LINE_PER_CORE = 5000  # lines per worker task; overridden by --LINE_PER_CORE
NUM_CORE = 30  # worker processes; overridden by --NUM_CORE
FLOOR_COUNT = 10  # not referenced by this script; kept in case other code reads it
CEIL_COUNT = 200  # not referenced by this script; kept in case other code reads it


def _timestamp():
    """Return the current local time formatted for progress messages."""
    return time.strftime('%Y-%m-%d %X', time.localtime())


def process_one(_in):
    """Segment a chunk of lines (runs inside a worker process).

    Args:
        _in: iterable of text lines.

    Returns:
        A list with one string per input line: the tokens produced by
        ``jieba.cut`` joined with single spaces, with surrounding whitespace
        (including the trailing newline) stripped.
    """
    return [' '.join(jieba.cut(line)).strip() for line in _in]


def do(l_list, writer, pool=None):
    """Segment ``l_list`` in parallel and write the result to ``writer``.

    The lines are split into chunks of ``LINE_PER_CORE`` lines, each chunk is
    segmented by ``process_one`` in a worker process, and the results are
    written in the original line order, one segmented line per output line.

    Args:
        l_list: list of text lines to segment.
        writer: text file-like object exposing ``writelines``.
        pool: optional ``multiprocessing.Pool`` to reuse. When omitted, a pool
            of ``NUM_CORE`` workers is created for this call and shut down
            before returning.
    """
    if not l_list:
        return

    chunks = [
        l_list[start:start + LINE_PER_CORE]
        for start in range(0, len(l_list), LINE_PER_CORE)
    ]

    owns_pool = pool is None
    if owns_pool:
        pool = Pool(NUM_CORE)
    try:
        # Pool.map returns the results in the same order as ``chunks``.
        results = pool.map(process_one, chunks)
    finally:
        # Only shut down a pool created here; a caller-supplied one is reused.
        if owns_pool:
            pool.close()
            pool.join()

    for segmented in results:
        writer.writelines(line + '\n' for line in segmented)


def main():
    """Parse the command line and segment every ``*.txt`` file of the input folder."""
    # ``do`` reads these module-level settings, so the command-line values
    # have to replace them rather than live in locals.
    global LINE_PER_CORE, NUM_CORE

    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", help="input folder", default=".")
    parser.add_argument("-o", "--output", help="output folder", default="w_process")
    parser.add_argument("--LINE_PER_CORE", help="# lines per core", type=int, default=20000)
    parser.add_argument("--NUM_CORE", help="# of cores", type=int, default=30)
    parser.add_argument("--coding", type=str, default="utf-8",
                        help="encoding of the input files (output is always utf-8)")
    args = parser.parse_args()
    print("Args :", args)

    # A zero chunk size would make range() fail in the middle of a run.
    if args.LINE_PER_CORE < 1 or args.NUM_CORE < 1:
        parser.error("--LINE_PER_CORE and --NUM_CORE must be positive integers")

    input_folder = args.input
    output_folder = args.output
    LINE_PER_CORE = args.LINE_PER_CORE
    NUM_CORE = args.NUM_CORE
    coding = args.coding

    # Number of lines buffered before they are handed to the pool.
    batch_size = NUM_CORE * LINE_PER_CORE

    os.makedirs(output_folder, exist_ok=True)

    # One pool for the whole run, so workers are not restarted for each batch.
    pool = Pool(NUM_CORE)
    try:
        for f in Path(input_folder).files('*.txt'):
            print(f.basename(), _timestamp())
            # File name without its last extension.
            # NOTE(review): done with splitext instead of the `path` package's
            # `namebase` attribute, which newer releases appear to expose as
            # `.stem` -- confirm against the installed version.
            stem = os.path.splitext(str(f.basename()))[0]
            out_path = os.path.join(output_folder, '%s.output.txt' % (stem,))
            with open(out_path, 'w', encoding='utf-8') as f_out:
                with open(f.abspath(), 'r', encoding=coding) as f_in:
                    l_list = []
                    for l in f_in:
                        # Append first, then flush, so the line that fills the
                        # batch is kept.
                        l_list.append(l)
                        if len(l_list) >= batch_size:
                            do(l_list, f_out, pool)
                            print(f.basename(), _timestamp())
                            l_list = []
                    if l_list:
                        do(l_list, f_out, pool)
            print(_timestamp())
    finally:
        pool.close()
        pool.join()


# The guard is required on Windows: multiprocessing starts workers there by
# importing this module again, and they must not re-run the entry point.
if __name__ == '__main__':
    main()
1条答案
按热度按时间ny6fqffe1#
You can try this. It works in Windows.