cseas · eastonsuo · Jun 9, 2020 · Jun 9, 2020
diff --git a/extract_text_mac.sh b/extract_text_mac.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+BPATH=$1  # Path to directory containing PDFs.
+OPATH=$2  # Path to output directory.
+TPATH=$3
+LANG=$4   # See man tesseract > LANGUAGES
+MIN_WORDS=5     # Number of words required to accept pdftotext result.
+if [ $(echo "$LANG" | wc -c ) -lt 1 ]   # Language defaults to eng.
+    then
+        LANG='eng'
+fi
+# If the output path does not exist, attempt to create it.
+if [ ! -d "$OPATH" ]; then
+    mkdir -p "$OPATH"
+fi
+
+if [ ! -d "$TPATH" ]; then
+    mkdir -p "$TPATH"
+fi
+for FILEPATH in $BPATH*.pdf; do
+    # Extracts plain text content from a PDF.
+    #
+    # First, attempts to extract embedded text with pdftotext. If that fails,
+    #  converts the PDF to TIFF and attempts to perform OCR with Tesseract.
+    #
+    # Path to text file to be created. E.g. ./myfile.txt
+    OUTFILE=$OPATH$(basename $FILEPATH).txt
+    touch "$OUTFILE"    # The text file will be created regardless of whether
+                        #  text is successfully extracted.
+    # First attempt to use pdftotext to extract embedded text.
+    echo -n "Attempting pdftotext extraction..."
+    pdftotext "$FILEPATH" "$OUTFILE"
+    FILESIZE=$(wc -w < "$OUTFILE")
+    echo "extracted $FILESIZE words."
+    # If that fails, try Tesseract.
+    if [[ $FILESIZE -lt $MIN_WORDS ]]
+        then
+            echo -n "Attempting OCR extraction..."
+            # Use imagemagick to convert the PDF to a high-rest multi-page TIFF.
+            convert -density 300 "$FILEPATH" -depth 8 -strip -background white \
+                    -alpha off $TPATH$(basename $FILEPATH).tiff
+            # Then use Tesseract to perform OCR on the tiff.
+            tesseract $TPATH$(basename $FILEPATH).tiff "$OUTFILE" $LANG
+            # We don't need then intermediate TIFF file, so discard it.
+            rm $TPATH$(basename $FILEPATH).tiff
+            FILESIZE=$(wc -w < "$OUTFILE")
+            echo "extracted $FILESIZE words."
+    fi
+
+done
diff --git a/shell_ocr_mac.py b/shell_ocr_mac.py
@@ -0,0 +1,8 @@
+import subprocess
+import shlex
+# shlex is used for passing parameters
+
+# subprocess.call(['./extract_text.sh'])
+# Run above if no parameters are required
+
+subprocess.call(shlex.split('./extract_text.sh ./pdf/ ./txt/ ./tiff/' ))