pydoop 在对 HDFS 文件调用 readline 时卡住

ztmd8pv5  于 2021-06-03  发布在  Hadoop
关注(0)|答案(1)|浏览(410)

我正在读取一个目录中所有文件的第一行。在本地运行正常,但在 EMR 上这个测试会失败,在读到大约 200-300 个文件时卡住。同时 `ps -elf` 显示,在刚打印到第 200 行时,子进程数量就已经增加到 3000。
是不是在 EMR 上读取的最大字节数有限制?pydoop 版本:pydoop==0.12.0

import os
import sys
import shutil
import codecs
import pydoop.hdfs as hdfs

def prepare_data(hdfs_folder):
    """Build a local folder of identical files and upload it to HDFS.

    A local directory named ``test_folder`` is recreated from scratch and
    filled with 700 copies of the file ``file`` (resolved relative to the
    current working directory), named ``file_0`` .. ``file_699``.  The
    target ``hdfs_folder`` is then removed and the local directory is
    copied there, both through the ``hadoop fs`` command line.

    Each shell command is printed before it is executed; the exit status
    returned by ``os.system`` is not inspected.
    """
    local_dir = "test_folder"
    template = "file"
    total_copies = 700

    # Always start from an empty local folder.
    if os.path.exists(local_dir):
        shutil.rmtree(local_dir)
    os.makedirs(local_dir)

    # Fill it with numbered copies of the template file.
    for index in range(total_copies):
        copy_name = template + "_" + str(index)
        shutil.copyfile(template, "/".join((local_dir, copy_name)))

    # Drop whatever is currently at the HDFS location, then upload the
    # freshly built folder (hadoop fs -copyFromLocal test_folder/ <dest>).
    shell_commands = (
        "hadoop fs -rmr " + hdfs_folder,
        "hadoop fs -copyFromLocal " + local_dir + " " + hdfs_folder,
    )
    for shell_command in shell_commands:
        print(shell_command)
        os.system(shell_command)

def _print_first_line(conn_hdfs, file_name):
    """Print the first line of one HDFS file, always releasing its handle.

    Any exception raised while opening or reading the file is reported on
    stdout and swallowed so the caller can move on to the next file.
    """
    file_handle = None
    try:
        file_handle = conn_hdfs.open_file(file_name)
        print(file_handle.readline())
    except Exception as exp:
        print('####Exception \'%s\' in reading file %s' % (str(exp), file_name))
    finally:
        # Close only a handle that was actually opened: if open_file()
        # itself failed there is nothing to close, and closing here (once)
        # avoids touching a stale handle left over from a previous file.
        if file_handle is not None:
            file_handle.close()


def main(hdfs_folder):
    """Print the first line of every regular file directly under ``hdfs_folder``.

    Opens an HDFS connection, lists ``hdfs_folder`` if it exists and, for
    each entry whose ``kind`` is ``"file"``, prints its name followed by
    its first line.  Per-file failures are reported and skipped; any other
    failure is reported once and ends the run.  Nothing is raised to the
    caller and nothing is returned.
    """
    try:
        conn_hdfs = hdfs.fs.hdfs()
        try:
            if conn_hdfs.exists(hdfs_folder):
                for item in conn_hdfs.list_directory(hdfs_folder):
                    if item["kind"] != "file":
                        continue
                    file_name = item["name"]
                    print("validating file : %s" % file_name)
                    _print_first_line(conn_hdfs, file_name)
        finally:
            # Release the connection even when listing or reading fails.
            conn_hdfs.close()

    except Exception as e:
        print("####Exception \'%s\' in validating files!" % str(e))

if __name__ == '__main__':
    # HDFS location used both for uploading the test data and for reading
    # it back.
    hdfs_path = '/abc/xyz'

    # Upload the test files first, then read the first line of each one.
    for step in (prepare_data, main):
        step(hdfs_path)
nkoocmlb

nkoocmlb1#

我建议使用 subprocess 来读取第一行,而不是 pydoop 的 conn_hdfs.open_file:
```
import subprocess
cmd='hadoop fs -cat {f}|head -1'.format(f=file_name)
process=subprocess.Popen(cmd, shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
stdout, stderr=process.communicate()
if stderr!='':
file_line=stdout.split('\n')[0]
else:
print "####Exception '{e}' in reading file {f}".format(f=file_name,e=stdout)
continue

相关问题