#!/bin/bash

#
# getMails - this gets mails and mbox files and splits mbox files into single mails
#
#	License: this script comes with absolutely no warranty
#						USE IT (this script) ON YOUR OWN RISK !
#				 This script is free to use. Please let me know if
#				 it was of any use for you.
#
#	Author: Jean-Michel Bruenn <himself@jeanbruenn.info>
#	Date: 27th April 2010

# extensions which will most likely contain emails
good_extensions="mail email eml mbox mbx"
# extensions which will NOT contain emails. Do not list html here: some
# data recovery software detects html mails as .html
bad_extensions="ini desktop log sqlite dat dmp extra json aac avi mpg mpeg mp3 mp4 exe gif jpg png bmp ogg ac3 wav sh php pdf tar gz bz2 zip rar odt patch csv ods conf 264"
# maximum allowed filesize in KB (default 4 GB) - matters only for files
# where no extension matched.
max_filesize=4194304

# Both directories are mandatory. Without this check an empty save_dir
# would make the mkdir calls below create /mbox and /mails at the root of
# the filesystem.
if [ "$#" -lt 2 ] || [ -z "$1" ] || [ -z "$2" ]; then
  echo "Usage: ${0##*/} <directory to scan> <directory to save mails in>" >&2
  exit 1
fi

if [ ! -d "$1" ]; then
  echo "${0##*/}: '$1' is not a directory" >&2
  exit 1
fi

# directory that is searched for mails
work_dir="$1"
# directory that receives the results (mails/) and the scratch area (mbox/)
save_dir="$2"
# scratch directory the messages of an mbox are extracted into
tmp_scandir="$save_dir/mbox"
# extension given to the messages extracted from an mbox
tmp_extension="eml"

if ! mkdir -p "$save_dir/mbox" "$save_dir/mails"; then
  echo "${0##*/}: cannot create output directories below '$save_dir'" >&2
  exit 1
fi

# checkFilename - pick a file name that is not yet taken in a directory
#
# Arguments:
#   $1 - directory the file is going to be stored in
#   $2 - file name without its extension
#   $3 - extension (without the leading dot)
#   $4 - optional: highest numeric suffix to try (default 99999)
# Globals:
#   newfilename (written) - "$2.$3" when that name is free, otherwise the
#                           first free "$2.<n>.$3" with n counting up from
#                           1; empty when every candidate is taken
# Outputs:
#   progress messages on stdout; an error on stderr when no name is free
# Returns:
#   0 when a free name was stored in newfilename, 1 otherwise
function checkFilename()
{

  local save_dir="$1";
  local basename="$2";
  local extension="$3";
  local max_suffix="${4:-99999}";
  local n;

  # never hand back the result of an earlier call by accident
  newfilename="";

  # the plain name is still free, nothing to rename
  if [ ! -f "$save_dir/$basename.$extension" ]; then
    newfilename="$basename.$extension";
    return 0;
  fi

  echo "Same filename detected, renaming...";
  for (( n = 1; n <= max_suffix; n++ )); do
    if [ ! -f "$save_dir/$basename.$n.$extension" ]; then
      # wow, we got a free filename. Give it back to the caller.
      newfilename="$basename.$n.$extension";
      echo "Renamed to: $newfilename";
      return 0;
    fi
  done

  # every suffix up to the limit is in use; the caller must not copy
  echo "No free filename left for $basename.$extension in $save_dir" >&2;
  return 1;

}

# splitMbox - split an mbox file into one file per message using formail
#
# Arguments:
#   $1 - path of the mbox file
# Globals:
#   tmp_scandir   (read, exported) - directory that receives the messages
#   tmp_extension (read, exported) - extension given to every message file
#   FILENO        (written, exported) - start value and width of the
#                                       message counter formail maintains
# Outputs:
#   progress messages on stdout, errors on stderr
# Returns:
#   0 on success, 1 if the target directory cannot be created, otherwise
#   the exit status of formail
function splitMbox()
{

  local mbox="$1";
  local status=0;
  local count=0;
  local probe last;

  # formail hands the current message number to the program it starts
  # through FILENO; presetting it fixes the start value and the zero
  # padding. The child shell also needs the two path variables.
  FILENO=000000;
  export FILENO
  export tmp_scandir
  export tmp_extension

  if [ ! -d "$tmp_scandir" ]; then
    mkdir -pv "$tmp_scandir" || return 1;
  fi

  echo " + Extracting mails from $mbox..."
  # All paths are quoted (here and inside the child shell) so that
  # directories and mbox names containing spaces work regardless of the
  # caller's IFS.
  formail -d -s sh -c 'cat - >"$tmp_scandir/$FILENO.$tmp_extension"' <"$mbox" || status=$?
  if [ "$status" -ne 0 ]; then
    echo " + formail failed on $mbox (exit status $status)" >&2
  fi

  # Count the consecutively numbered files starting at 000000. Files left
  # over in the directory from an earlier split are counted as well.
  probe=$(printf "%06d" "$count");
  while [ -e "$tmp_scandir/$probe.$tmp_extension" ]; do
    count=$((count + 1));
    probe=$(printf "%06d" "$count");
  done

  if [ "$count" -eq 0 ]; then
    echo " + No mails were extracted from $mbox";
  else
    last=$(printf "%06d" $((count - 1)));
    echo "Split $mbox into files 000000.$tmp_extension thru $last.$tmp_extension..."
  fi

  return "$status";

}

# Stage 1: walk $work_dir and write every mail candidate to the list file.
# The list is consumed by the processing stage further down; every line
# has the form "<path>:<sha1 of the content>".
# NOTE(review): the fixed name in /tmp is predictable; switching to mktemp
# requires changing the processing stage at the same time.
list_file="/tmp/getMails.txt"

if [ -f "$list_file" ]; then
  rm -f -- "$list_file"
  echo "+ deleted $list_file" >&2
fi

echo "+ getting filelist...";

while IFS= read -r -d '' file; do

  # the list is line based, a path containing a newline cannot be stored
  if [[ "$file" == *$'\n'* ]]; then
    echo " + Newline in file name, skipping..." >&2
    continue
  fi

  # strip everything up to the last slash (/)
  filename=${file##*/}
  # Split at the last dot. A name without any dot has no extension at
  # all; testing for the dot (instead of comparing extension and
  # basename) keeps names such as "mbox.mbox" working.
  if [[ "$filename" == *.* ]]; then
    basename=${filename%.*}
    extension=${filename##*.}
  else
    basename=$filename
    extension=""
  fi

  # Hash via stdin: when given a file name, sha1sum marks names that
  # contain a backslash or newline by prefixing the output line with a
  # backslash, which would end up in the checksum field.
  sum=$(sha1sum <"$file" | cut -d" " -f1)
  if [[ -z "$sum" ]]; then
    echo " + Cannot read $file, skipping..." >&2
    continue
  fi

  # detect copies to not process already processed files
  if [[ -f "$list_file" ]] && grep -qF -- ":$sum" "$list_file"; then
    echo " + Copy detected, skipping...";
    continue
  fi

  if [[ -n "$extension" ]]; then
    for ext in $bad_extensions; do
      if [[ "$extension" == "$ext" ]]; then
        echo " + Found BAD_EXTENSION: $ext, skipping $filename...";
        continue 2
      fi
    done

    for ext in $good_extensions; do
      if [[ "$extension" == "$ext" ]]; then
        echo " + Found GOOD_EXTENSION: $ext, adding $filename to list...";
        echo "$file:$sum" >> "$list_file"
        echo "+ added $file:$sum" >&2
        continue 2
      fi
    done
  fi

  # still here? Then the extension told us nothing, look at the content.
  mime=$(file -b "$file")
  if [[ "$mime" != *html* && "$mime" != *mail* ]]; then
    continue
  fi

  # size in KB; only needed (and therefore only computed) on this path
  filesize=$(du -k "$file" | cut -f1)
  if [[ -n "$filesize" ]] && (( filesize > max_filesize )); then
    echo " + File is too big, skipping...";
    continue
  fi

  # one From:/To: pair means a single mail, several pairs mean an mbox
  from_count=$(grep -c -e "^From: " "$file")
  to_count=$(grep -c -e "^To: " "$file")
  if (( from_count == 1 && to_count == 1 )); then
    echo " + Found a good (single) mail 'candidate', adding $file to list...";
  elif (( from_count > 1 && to_count > 1 )); then
    echo " + Found a good (MBOX) mail 'candidate', adding $file to list...";
  else
    continue
  fi
  echo "$file:$sum" >> "$list_file"
  echo "+ added $file:$sum" >&2

done < <(find "$work_dir" -type f -print0);

# Stage 2: copy every candidate from the list file into $save_dir/mails.
# Single mails are copied as they are, mbox files are first split into
# $tmp_scandir and the resulting messages are copied one by one.
# Every line of the list has the form "<path>:<sha1 of the content>".
list_file="/tmp/getMails.txt"

if [ ! -f "$list_file" ]; then
  echo " + Filelist is gone, aborting...";
else
  echo " + Processing filelist...";
  # Restrict word splitting to newlines from here on, so that paths with
  # spaces survive any unquoted expansion further down the call chain.
  IFS=$'\n';
  # The list is read line by line on descriptor 3: nothing is glob
  # expanded and stdin stays untouched for the commands in the loop.
  while IFS= read -r -u 3 LINE; do

    [[ -n "$LINE" ]] || continue

    # remove only the trailing ":<sha1>" so that a ':' inside the path
    # does not cut the path short
    file=${LINE%:*}
    echo " + Processing $file...";
    if [[ ! -f "$file" ]]; then
      echo " + $file is gone, skipping..." >&2
      continue
    fi

    # strip everything up to the last slash (/)
    filename=${file##*/}
    # Split at the last dot. A file without extension is only on the list
    # because its content looked like mail, so it gets the mail extension
    # instead of its own name a second time.
    if [[ "$filename" == *.* ]]; then
      basename=${filename%.*}
      extension=${filename##*.}
    else
      basename=$filename
      extension=""
    fi
    [[ -n "$extension" ]] || extension=$tmp_extension

    # one From:/To: pair means a single mail, several pairs mean an mbox
    from_count=$(grep -c -e "^From: " "$file")
    to_count=$(grep -c -e "^To: " "$file")

    if (( from_count == 1 && to_count == 1 )); then

      echo " + Found single mail 'candidate', copying to $save_dir/mails/...";
      # sets $newfilename; skip the file when no free name is available
      checkFilename "$save_dir/mails" "$basename" "$extension" || continue

      echo " + Copying as $newfilename to $save_dir";
      cp -rva "$file" "$save_dir/mails/$newfilename";

    elif (( from_count > 1 && to_count > 1 )); then

      echo " + Found mbox mail 'candidate', extracting to $save_dir/mbox/...";

      splitMbox "$file" || echo " + Splitting $file reported an error" >&2

      while IFS= read -r -d '' mbox; do

        # name, basename and extension of the extracted message
        mfilename=${mbox##*/}
        mbasename=${mfilename%.*}
        mextension=${mfilename##*.}
        checkFilename "$save_dir/mails" "$mbasename" "$mextension" || continue

        echo " + $filename: Copying as $newfilename to $save_dir/mails";
        cp -rva "$mbox" "$save_dir/mails/$newfilename";

      done < <(find "$tmp_scandir" -type f -print0);

      echo "Deleting contents of tmp dir";
      # ${...:?} aborts instead of expanding to "/*" should the variable
      # ever be empty
      rm -vf -- "${tmp_scandir:?}"/*

    else
      echo " + $file is neither a single mail nor an mbox, skipping...";
    fi
  done 3< "$list_file"
fi

echo " + Finished. Now import your mails :-)";
