3

Aug 13

Decompress gz files on HDFS

I had a task the other day where I had 110GB of compressed log files that I wanted to import into Impala (Cloudera). Currently, Impala does not support compressed files, so I had to decompress them all. I created this handy script and thought you might find it useful. I mounted the S3 bucket using s3fs, which I mentioned in my earlier post.

#!/bin/bash
# Utils
#######################################
# Run a command and report how long it took.
# Arguments: the command to run, followed by its own arguments
# Outputs:   whatever the command prints, then one line of the form
#            "Elapsed time: <h>h <m>m <s>s" on stdout
# Returns:   the exit status of the command that was run, so callers can
#            tell whether the timed step actually succeeded
#######################################
elapsed()
{
   # SECONDS is bash's built-in counter of whole seconds since shell start.
   local elapsed_start=$SECONDS

   "$@"
   # Capture the status immediately; the echo below would overwrite it.
   local rc=$?

   local total=$(( SECONDS - elapsed_start ))
   local hours=$(( total / 3600 ))
   local minutes=$(( total / 60 % 60 ))
   local secs=$(( total % 60 ))

   echo "Elapsed time: ${hours}h ${minutes}m ${secs}s"
   return "$rc"
}

#######################################
# Decompress one gzip file that lives on HDFS, writing the result next to it
# and removing the compressed original once decompression has succeeded.
# Globals:   FILE  (read)    - file name used when no argument is given
#            UFILE (written) - set to the decompressed file name; left
#                              global on purpose so the caller can print it
# Arguments: $1 - name of the .gz file inside the HDFS directory
#                 (default: $FILE)
#            $2 - HDFS directory holding the file
#                 (default: /user/hdfs/oms/logs)
# Outputs:   the `hdfs dfs -ls` listing of the new file on stdout;
#            error messages on stderr
# Returns:   0 on success; 1 if the name does not end in .gz or the
#            decompression pipeline failed; otherwise the status of the
#            failing rm/ls command
#######################################
convert()
{
  local src=${1:-${FILE:-}}
  local hdfs_dir=${2:-/user/hdfs/oms/logs}

  # Require a non-empty base name followed by .gz; anything else would
  # produce a bogus target name and then delete the source.
  if [[ $src != ?*.gz ]]; then
    echo "convert: '$src' is not a .gz file name, skipping" >&2
    return 1
  fi

  # Remove the .gz extension from the compressed file name
  UFILE=${src%.gz}

  # Decompress gz file: stream it out of HDFS, through gunzip, and back in.
  # pipefail is enabled inside a subshell so that a failure in ANY stage is
  # detected without changing the caller's shell options.
  if ! (
    set -o pipefail
    sudo -u hdfs hdfs dfs -cat "$hdfs_dir/$src" \
      | sudo -u hdfs gunzip -d \
      | sudo -u hdfs hdfs dfs -put - "$hdfs_dir/$UFILE"
  ); then
    # The target is deliberately not removed here: it may be a file that
    # already existed before this run (which makes -put fail).
    echo "convert: failed to decompress $hdfs_dir/$src; original kept" >&2
    return 1
  fi

  # Discard original gz file (only reached when decompression succeeded;
  # -skipTrash makes this irreversible)
  sudo -u hdfs hdfs dfs -rm -skipTrash "$hdfs_dir/$src" || return
  sudo -u hdfs hdfs dfs -ls "$hdfs_dir/$UFILE"
}

# Decompress every .gz log whose name appears in the local log directory.
# Only the file NAMES come from this directory; convert reads and writes the
# files themselves under its HDFS path (presumably the same logs were already
# copied there - verify before running, since originals are deleted).
log_dir=/media/ephemeral0/logs
failures=0

# Glob instead of parsing `ls`, so names with spaces or glob characters are
# handled as single words.
for path in "$log_dir"/*.gz; do
  [[ -e $path ]] || continue    # the glob matched nothing
  FILE=${path##*/}              # convert works with the bare file name

  if elapsed convert "$FILE"; then
    echo "Decompressed $FILE to $UFILE on hdfs"
  else
    echo "Failed to decompress $FILE" >&2
    failures=$(( failures + 1 ))
  fi
done

# Report overall failure to the caller if any file could not be processed.
(( failures == 0 )) || exit 1
exit 0

No comments yet, be the first.

Leave a Reply