3 Aug 13
Decompress gz files on HDFS
I had a task the other day where I had 110GB of compressed log files that I wanted to import into Impala (Cloudera). Currently, Impala does not support compressed files, so I had to decompress them all first. I mounted the S3 bucket using s3fs, as I mentioned in an earlier post, and put together this handy script; you might find it useful.
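For reference, mounting the bucket with s3fs looks roughly like this; the bucket name, mount point, and credentials file below are placeholders rather than the exact values from my setup:

# bucket name, mount point, and password file are placeholders
s3fs my-log-bucket /mnt/s3-logs -o passwd_file=${HOME}/.passwd-s3fs

Here is the script: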
#!/bin/bash
# Utils: time a command and print how long it took
elapsed()
{
   (( start = SECONDS ))
   "$@"
   (( seconds = SECONDS - start ))
   (( etime_seconds = seconds % 60 ))
   (( etime_minutes = ( seconds - etime_seconds ) / 60 % 60 ))
   (( etime_hours   = seconds / 3600 ))
   echo "Elapsed time: ${etime_hours}h ${etime_minutes}m ${etime_seconds}s"
}
# Decompress a single gz file in place on HDFS
convert()
{
# Remove the .gz extension from the compressed file name
UFILE="${FILE%.gz}"
# Stream the gz file out of HDFS, decompress it, and write it back uncompressed
sudo -u hdfs hdfs dfs -cat "/user/hdfs/oms/logs/$FILE" | \
sudo -u hdfs gunzip -d | sudo -u hdfs hdfs dfs -put - "/user/hdfs/oms/logs/$UFILE"
# Discard the original gz file
sudo -u hdfs hdfs dfs -rm -skipTrash "/user/hdfs/oms/logs/$FILE"
sudo -u hdfs hdfs dfs -ls "/user/hdfs/oms/logs/$UFILE"
}
for GZPATH in /media/ephemeral0/logs/*.gz
  do
    FILE=$(basename "$GZPATH")
    elapsed convert "$FILE"
    echo "Decompressed $FILE to $UFILE on hdfs"
  done
exit 0
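Once the files are decompressed, Impala can query them as an external table over that HDFS directory. The statement below is only a sketch; the table name and columns are made up, so adjust them to match your log format:

# table name and columns are placeholders for illustration
impala-shell -q "CREATE EXTERNAL TABLE oms_logs (ts STRING, level STRING, message STRING)
  ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
  LOCATION '/user/hdfs/oms/logs';"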