I had a task the other day where I had 110GB of compressed log files and wanted to import them into Impala (Cloudera). Currently, Impala does not support compressed files, so I had to decompress them all. I created this handy script and thought you might find it useful. I mounted the S3 bucket using s3fs, which I mentioned in my earlier post.
#!/bin/bash
# Utils
#######################################
# Run a command and report how long it took.
# Arguments:
#   "$@" - the command (with its arguments) to run
# Outputs:
#   whatever the command prints, followed by one
#   "Elapsed time: <h>h <m>m <s>s" line on stdout
# Returns:
#   the exit status of the command that was run
#######################################
elapsed() {
  # Only this variable is declared before the command runs, and it carries a
  # distinctive name so the wrapped command is unlikely to clobber it.
  local elapsed_t0=$SECONDS

  "$@"
  local rc=$?

  local total=$(( SECONDS - elapsed_t0 ))
  local hours=$(( total / 3600 ))
  local minutes=$(( total / 60 % 60 ))
  local secs=$(( total % 60 ))

  echo "Elapsed time: ${hours}h ${minutes}m ${secs}s"

  # Hand the command's status back so callers can tell success from failure.
  return "$rc"
}
#######################################
# Decompress one gzip'd log file on HDFS and then discard the original.
# Globals:
#   FILE  (read)    - fallback file name when no argument is given
#   UFILE (written) - the file name with its .gz suffix removed
# Arguments:
#   $1 - name of the .gz file under /user/hdfs/oms/logs (defaults to $FILE)
# Outputs:
#   the `hdfs dfs -ls` listing of the decompressed file on stdout;
#   diagnostics on stderr
# Returns:
#   0 on success; non-zero if the name does not end in .gz or if any stage
#   of the decompress pipeline fails (the original .gz is then left in place)
#######################################
convert() {
  local gz_name=${1:-${FILE:-}}
  local hdfs_dir=/user/hdfs/oms/logs

  # Refuse anything that is not "<something>.gz": stripping a suffix that is
  # not there would make source and destination collide.
  if [[ "$gz_name" != ?*.gz ]]; then
    printf 'convert: not a .gz file name: %s\n' "$gz_name" >&2
    return 1
  fi

  # Remove the .gz extension from the compressed file name.
  # UFILE is deliberately global: the calling loop prints it.
  UFILE=${gz_name%.gz}

  # Decompress gz file: stream it out of HDFS, gunzip, stream it back in.
  sudo -u hdfs hdfs dfs -cat "${hdfs_dir}/${gz_name}" \
    | sudo -u hdfs gunzip -d \
    | sudo -u hdfs hdfs dfs -put - "${hdfs_dir}/${UFILE}"
  # Snapshot immediately; the next command overwrites PIPESTATUS.
  local -a stage_status=("${PIPESTATUS[@]}")

  local rc
  for rc in "${stage_status[@]}"; do
    if (( rc != 0 )); then
      printf 'convert: decompressing %s failed (stage statuses: %s); keeping original\n' \
        "$gz_name" "${stage_status[*]}" >&2
      return 1
    fi
  done

  # Discard original gz file -- only reached when every stage succeeded.
  sudo -u hdfs hdfs dfs -rm -skipTrash "${hdfs_dir}/${gz_name}" || return
  sudo -u hdfs hdfs dfs -ls "${hdfs_dir}/${UFILE}"
}
# Decompress every .gz log named in the local staging directory. Only the file
# names come from the local directory; convert works on the same names under
# the HDFS log directory.
failures=0
for path in /media/ephemeral0/logs/*.gz; do
  # An unmatched glob stays literal; skip it when there are no .gz files.
  [[ -e "$path" ]] || continue

  # convert reads the bare file name, so strip the directory part.
  FILE=${path##*/}

  # Branch on the status elapsed hands back for this file.
  if elapsed convert "$FILE"; then
    echo "Decompressed $FILE to $UFILE on hdfs"
  else
    echo "Failed to decompress $FILE" >&2
    failures=$(( failures + 1 ))
  fi
done

# Exit non-zero when any file was reported as failed.
if (( failures > 0 )); then
  exit 1
fi
exit 0