TensorFlow HDFS support

Starting with TensorFlow 2.6.0, HDFS support is no longer built into TensorFlow itself: to read hdfs:// (and in particular viewfs://) paths, you must install tensorflow-io and import tensorflow_io alongside tensorflow.

Install it with:

pip install tensorflow-io

Then import both packages:

import tensorflow as tf
import tensorflow_io as tfio
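
With tensorflow_io imported, hdfs:// and viewfs:// paths work through the usual tf.io.gfile API. A minimal sketch, assuming the environment variables described below are already set; the namenode address and file paths here are placeholders:

import tensorflow as tf
import tensorflow_io as tfio  # importing tfio registers the hdfs:// and viewfs:// schemes

# Placeholder cluster address and paths -- substitute your own.
print(tf.io.gfile.listdir("hdfs://namenode:8020/tmp"))
with tf.io.gfile.GFile("viewfs://my-cluster/tmp/hello.txt", "r") as f:
    print(f.read())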

HDFS

Environment variables

LD_LIBRARY_PATH must let the dynamic linker find both libjvm.so and libhdfs.so.

CLASSPATH must be the glob-expanded form (the JVM that libhdfs embeds does not expand wildcard classpath entries):

export CLASSPATH=$(hadoop classpath --glob)
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$JAVA_HOME/jre/lib/amd64/server/:$HADOOP_HOME/native
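
To verify the setup end to end, try reading data from HDFS with tf.data. A sketch; the TFRecord path is a placeholder:

import tensorflow as tf
import tensorflow_io as tfio  # needed so the hdfs:// scheme is registered

# Placeholder path -- point at a real TFRecord file on your cluster.
dataset = tf.data.TFRecordDataset(["hdfs://namenode:8020/data/train-00000.tfrecord"])
for raw_record in dataset.take(1):
    print(len(raw_record.numpy()), "bytes read from HDFS")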

Compiling libhdfs

Using hadoop-2.6.0-src as an example.

vim hadoop-common-project/hadoop-annotations/pom.xml

Change the 1.7 references to 1.8 (so the build works under JDK 8), then run:

mvn -Pdist,native clean package -DskipTests -Dmaven.javadoc.skip=true

In hadoop-2.6.0-src/hadoop-common-project/hadoop-common/pom.xml, locate the <javahClassName> entries and copy them into a.txt:

<javahClassName>org.apache.hadoop.io.compress.zlib.ZlibCompressor</javahClassName>
<javahClassName>org.apache.hadoop.io.compress.zlib.ZlibDecompressor</javahClassName>
<javahClassName>org.apache.hadoop.io.compress.bzip2.Bzip2Compressor</javahClassName>
<javahClassName>org.apache.hadoop.io.compress.bzip2.Bzip2Decompressor</javahClassName>
<javahClassName>org.apache.hadoop.security.JniBasedUnixGroupsMapping</javahClassName>
<javahClassName>org.apache.hadoop.io.nativeio.NativeIO</javahClassName>
<javahClassName>org.apache.hadoop.io.nativeio.SharedFileDescriptorFactory</javahClassName>
<javahClassName>org.apache.hadoop.security.JniBasedUnixGroupsNetgroupMapping</javahClassName>
<javahClassName>org.apache.hadoop.io.compress.snappy.SnappyCompressor</javahClassName>
<javahClassName>org.apache.hadoop.io.compress.snappy.SnappyDecompressor</javahClassName>
<javahClassName>org.apache.hadoop.io.compress.lz4.Lz4Compressor</javahClassName>
<javahClassName>org.apache.hadoop.io.compress.lz4.Lz4Decompressor</javahClassName>
<javahClassName>org.apache.hadoop.crypto.OpensslCipher</javahClassName>
<javahClassName>org.apache.hadoop.crypto.random.OpensslSecureRandom</javahClassName>
<javahClassName>org.apache.hadoop.util.NativeCrc32</javahClassName>
<javahClassName>org.apache.hadoop.net.unix.DomainSocket</javahClassName>
<javahClassName>org.apache.hadoop.net.unix.DomainSocketWatcher</javahClassName>

Extract the fully qualified class names:

cat a.txt | grep -E -o "org.[^<]+" > class.txt

class.txt should look like this:

org.apache.hadoop.io.compress.zlib.ZlibCompressor
org.apache.hadoop.io.compress.zlib.ZlibDecompressor
org.apache.hadoop.io.compress.bzip2.Bzip2Compressor
org.apache.hadoop.io.compress.bzip2.Bzip2Decompressor
org.apache.hadoop.security.JniBasedUnixGroupsMapping
org.apache.hadoop.io.nativeio.NativeIO
org.apache.hadoop.io.nativeio.SharedFileDescriptorFactory
org.apache.hadoop.security.JniBasedUnixGroupsNetgroupMapping
org.apache.hadoop.io.compress.snappy.SnappyCompressor
org.apache.hadoop.io.compress.snappy.SnappyDecompressor
org.apache.hadoop.io.compress.lz4.Lz4Compressor
org.apache.hadoop.io.compress.lz4.Lz4Decompressor
org.apache.hadoop.crypto.OpensslCipher
org.apache.hadoop.crypto.random.OpensslSecureRandom
org.apache.hadoop.util.NativeCrc32
org.apache.hadoop.net.unix.DomainSocket
org.apache.hadoop.net.unix.DomainSocketWatcher
Generate the JNI headers with javah:

cat class.txt | xargs javah -d javah -classpath $(hadoop classpath)

Then configure the native build with CMake:

cd hadoop-hdfs-project/hadoop-hdfs/src/
mkdir build
cd build
cmake -DCMAKE_INSTALL_PREFIX=$HOME/hdfs-native -DGENERATED_JAVAH=../javah -DJVM_ARCH_DATA_MODEL=64 -DREQUIRE_LIBWEBHDFS=OFF -DREQUIRE_FUSE=OFF ..

The build outputs:

libhdfs.so: target/usr/local/lib/libhdfs.so
header:     main/native/libhdfs/hdfs.h

libhdfs API documentation:

https://archive.cloudera.com/cdh4/cdh/4/hadoop-2.0.0-cdh4.6.0/hadoop-project-dist/hadoop-hdfs/LibHdfs.html

A minimal write example:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include "hdfs.h"

int main(int argc, char **argv) {
    /* "default" picks up fs.defaultFS from the Hadoop configuration on the CLASSPATH */
    hdfsFS fs = hdfsConnect("default", 0);
    const char *writePath = "/tmp/testfile.txt";
    hdfsFile writeFile = hdfsOpenFile(fs, writePath, O_WRONLY | O_CREAT, 0, 0, 0);
    if (!writeFile) {
        fprintf(stderr, "Failed to open %s for writing!\n", writePath);
        exit(-1);
    }
    const char *buffer = "Hello, World!";
    tSize num_written_bytes = hdfsWrite(fs, writeFile, (void *)buffer, strlen(buffer) + 1);
    if (hdfsFlush(fs, writeFile)) {
        fprintf(stderr, "Failed to 'flush' %s\n", writePath);
        exit(-1);
    }
    hdfsCloseFile(fs, writeFile);
    hdfsDisconnect(fs);
    return 0;
}
Compile and run. The rpath entry assumes libhdfs.so has been copied into ./lib next to the binary, and libjvm.so must still be reachable via LD_LIBRARY_PATH at runtime:

gcc -o main -Wl,-rpath='$ORIGIN/lib' -Iinclude -Llib main.c -lhdfs
CLASSPATH=$(hadoop classpath --glob) ./main