Skip to content

Commit

Permalink
Add files via upload
Browse files Browse the repository at this point in the history
  • Loading branch information
digidigital authored Apr 30, 2022
1 parent 2b002ce commit 6f85195
Show file tree
Hide file tree
Showing 2 changed files with 83 additions and 46 deletions.
96 changes: 55 additions & 41 deletions code/OCRthyPDF.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@
from copy import deepcopy
from configparser import ConfigParser, NoOptionError
from random import randint
from tempfile import TemporaryDirectory
from tempfile import TemporaryDirectory, tempdir
from shutil import copytree
import logging
import argparse
import darkdetect
Expand Down Expand Up @@ -107,7 +108,7 @@
# Config related
# 'background', temporarily disabled since ocrmypdf v13.0.0
stringOptions = ['ocr', 'noise', 'optimization', 'postfix', 'standard', 'confidence',
'deskew', 'rotate', 'sidecar', 'runsplitter', 'tess-thresholding',
'deskew', 'rotate', 'sidecar', 'runsplitter', 'tess-thresholding', 'savesplittext',
'separator', 'separatorpage', 'usesourcename', 'loglevel', 'areafactor']

pathOptions = ['filename','infolder','outfolder']
Expand Down Expand Up @@ -208,6 +209,20 @@ def writeConfig():
with open(configini, 'w+') as configfile:
config.write(configfile)

def updateInitialFolders():
    '''Sync the InitialFolder of the three browse buttons with the current inputs.

    For each input element the current value is read with .get(). A non-empty
    value becomes the start folder of the matching browse button (for the
    'filename' input, the directory part of the path is used). An empty value
    falls back to user_home.
    '''
    # (input element key, browse button key, value is a file path)
    browseTargets = (
        ('outfolder', 'outfolder_browse', False),
        ('infolder', 'infolder_browse', False),
        ('filename', 'file_browse', True),
    )

    for inputKey, browseKey, isFilePath in browseTargets:
        currentValue = window[inputKey].get()
        if currentValue == '':
            startFolder = user_home
        elif isFilePath:
            # A file was selected: start browsing in its parent directory
            startFolder = path.dirname(currentValue)
        else:
            startFolder = currentValue
        window[browseKey].InitialFolder = startFolder

# Just a simple popup message in a function so I can change formatting and behaviour in one place. :)
def popUp(message):
windowLocation = window.current_location()
Expand Down Expand Up @@ -254,10 +269,12 @@ def toggleButtons():
else:
window['stop_ocr'].update(disabled=True)


def deleteFiles(folder):
    '''Remove every entry that listdir() reports directly inside folder.

    folder -- path of the directory to empty; the directory itself is kept.
    '''
    # NOTE(review): remove() is applied to every entry, so the folder is
    # presumably expected to hold plain files only -- confirm with callers.
    targets = (path.join(folder, entryName) for entryName in listdir(folder))
    for target in targets:
        remove(target)


def cleanup(Job, popup=True):
#empty queues
while splitJobs.qsize()>0:
Expand All @@ -266,10 +283,7 @@ def cleanup(Job, popup=True):
ocrJobs.get()

Job['ocrQueueLen']=0
Job['splitQueueLen']=0

#clean tmpdir
deleteFiles(tmpdir.name)
Job['splitQueueLen']=0

#enable buttons
toggleButtons()
Expand All @@ -285,7 +299,7 @@ def cleanup(Job, popup=True):

#Reset queue bars in case jobs were stopped by user
window['ocr_queue_bar'].update(0)
window['split_queue_bar'].update(0)

log.info('Cleanup complete.')
return Job

Expand Down Expand Up @@ -318,14 +332,17 @@ def startSplitJob (filename, Job):
if tmpOptions['opt_usesourcename'] == 'no':
args = args + '-d '

# Save txt files
if tmpOptions['opt_savesplittext'] == 'yes':
args = args + '-t '

# Output folder
args = args + " -o '" + tmpdir.name + "'"

commandLine = "'" + scriptRoot + "/splitter.py' '" + Job['file'] + "' " + args
log.debug('Commandline: ' + commandLine)
execute = shlex.split(commandLine)



Job['process'] = subprocess.Popen (execute,stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
#make STDOUT/readline non-blocking!
set_blocking(Job['process'].stdout.fileno(), False)
Expand Down Expand Up @@ -416,7 +433,7 @@ def startOCRJob (filename, Job):
inputFilename = path.basename(Job['file'])
outFileParts = inputFilename.rsplit('.', 1)
outFile = path.join(tmpOptions['outfolder'], outFileParts[0] + tmpOptions['opt_postfix'] + '.' + outFileParts[1])

Job['outfile'] = outFile
commandLine = "ocrmypdf --use-threads " + args + "'" + Job['file'] + "' '" + outFile + "'"

execute = shlex.split(commandLine)
Expand All @@ -438,13 +455,17 @@ def startOCRJob (filename, Job):
# checks the queues according to their priority and starts the next job
def nextJob(previousJob=None):

# check if split produced files and add to ocrQueue, add original file if not
# delete ocr'ed file if a split job ran
if previousJob and previousJob['type'] == 'split':
for file in glob.glob(tmpdir.name + '/*.pdf'):
ocrJobs.put(file)
if ocrJobs.qsize() == 0:
ocrJobs.put(previousJob['file'])

remove(previousJob['file'])
copytree (tmpdir.name , tmpOptions['outfolder'], dirs_exist_ok=True)
deleteFiles(tmpdir.name)

# check if we have to run a split job after ocr
if previousJob and previousJob['type'] == 'ocr':
if tmpOptions['opt_runsplitter'] == 'yes':
splitJobs.put(previousJob['outfile'])

if not previousJob:
log.info('Creating first job.')
previousJob={'running': False, 'ocrQueueLen' : 0, 'splitQueueLen' : 0}
Expand All @@ -456,15 +477,13 @@ def nextJob(previousJob=None):

log.debug('Checking queues for next job. OCR-queue: ' + str(ocrJobs.qsize()) + ', Split-queue:' + str(splitJobs.qsize()))

if ocrJobs.qsize() > 0:
if splitJobs.qsize() > 0:
Job = startSplitJob (splitJobs.get(), previousJob)

elif ocrJobs.qsize() > 0:
Job = startOCRJob (ocrJobs.get(), previousJob)
window['ocr_queue_bar'].update(queuePercent(ocrJobs.qsize(), previousJob['ocrQueueLen']))

elif splitJobs.qsize() > 0:
#delete no longer needed split files from temp folder prior to new split job
deleteFiles(tmpdir.name)
Job = startSplitJob (splitJobs.get(), previousJob)
window['split_queue_bar'].update(queuePercent(splitJobs.qsize(), previousJob['splitQueueLen']))

else:
Job = cleanup(previousJob)

Expand Down Expand Up @@ -515,11 +534,12 @@ def nextJob(previousJob=None):
[sg.T('Separator pattern for QR Code (postfix is optional): <Separator_Code>|<Custom_Postfix>')],
[sg.T('<Custom_Postfix> is added to the filename in "Sticker Mode" if available')],
[sg.T('It replaces the index numbers -> You need to provide different postfixes for all files.')],
[sg.T('Run splitter prior to OCR:'),sg.InputCombo(('yes', 'no'), default_value='no', key='opt_runsplitter', enable_events = True)],
[sg.T('Run splitter after OCR:'),sg.InputCombo(('yes', 'no'), default_value='no', key='opt_runsplitter', enable_events = True)],
[sg.T('Separator code (add at least this to your QR code):'), sg.In('NEXT', key='opt_separator', change_submits = True, size = (15,1), enable_events = True)],
[sg.T('Separator mode?:'), sg.InputCombo(('Drop separator page', 'Sticker Mode'), default_value='Drop separator page', key='opt_separatorpage', tooltip='Sticker Mode: QR Code starts new segment. Page is added to output.', enable_events = True)],
[sg.T('Use source filename in output filename?:'),sg.InputCombo(('yes', 'no'), default_value='yes', key='opt_usesourcename', enable_events = True)],
[sg.T('Limit QR-code search area:'),sg.InputCombo(('1.0','0.5','0.25'), default_value='1', key='opt_areafactor', tooltip='Default: 1.0 - Multiply width and height with this factor to\nlimit the search area and speed up splitting.\n1 = Whole image(page)\n0.5 = Upper left quadrant\n0.25 = Upper left quadrant of upper left quadrant', enable_events = True)]
[sg.T('Limit QR-code search area:'),sg.InputCombo(('1.0','0.5','0.25'), default_value='1', key='opt_areafactor', tooltip='Default: 1.0 - Multiply width and height with this factor to\nlimit the search area and speed up splitting.\n1 = Whole image(page)\n0.5 = Upper left quadrant\n0.25 = Upper left quadrant of upper left quadrant', enable_events = True)],
[sg.T('Save text as separate .txt files:'),sg.InputCombo(('yes', 'no'), default_value='no', key='opt_savesplittext', enable_events = True)]
]

tab3_layout = [
Expand All @@ -532,18 +552,12 @@ def nextJob(previousJob=None):
]

colQueue1 = [
[
sg.Text('Split Job Queue:', pad = ((0,0),(0,0)))
],
[
sg.Text('OCR Job Queue: ', pad = ((0,0),(4,0)))
]
]

colQueue2 = [
[
sg.ProgressBar(100, key='split_queue_bar', size=(44,20))
],
[
sg.ProgressBar(100, key='ocr_queue_bar', size=(44,20))
]
Expand Down Expand Up @@ -584,7 +598,7 @@ def nextJob(previousJob=None):
[
sg.InputText(key='filename_short', disabled_readonly_background_color = readonly_background_color, disabled_readonly_text_color = readonly_text_color, readonly=True, size=(52,1)),
sg.InputText(key='filename', visible=False, readonly=True, enable_events=True),
sg.FileBrowse(('Browse'), file_types=(("PDF", "*.pdf"),("PDF", "*.PDF")), initial_folder=user_home)
sg.FileBrowse(('Browse'), key='file_browse', file_types=(("PDF", "*.pdf"),("PDF", "*.PDF")), initial_folder=user_home)
],
[
sg.InputText(key='infolder_short', disabled_readonly_background_color = readonly_background_color, disabled_readonly_text_color = readonly_text_color, readonly=True, size=(52,1)),
Expand Down Expand Up @@ -643,7 +657,8 @@ def nextJob(previousJob=None):
writeConfig()
else:
readConfig()


updateInitialFolders()

# Event Loop to process "events" and get the "values" of the inputs
while True:
Expand Down Expand Up @@ -741,6 +756,7 @@ def nextJob(previousJob=None):
inFilePath = path.dirname(values['filename'])
window['outfolder'].update(value = inFilePath)
window['outfolder_short'].update(value = limitFilenameLen(inFilePath))
updateInitialFolders()

if event == 'infolder' and not values['infolder'] == '' :
inFolderPath = values['infolder']
Expand All @@ -753,16 +769,18 @@ def nextJob(previousJob=None):
if values['outfolder'] == '':
window['outfolder'].update(value = inFolderPath)
window['outfolder_short'].update(value = limitFilenameLen( inFolderPath))
updateInitialFolders()

if event == 'outfolder' and not values['outfolder'] == '' :
# Shorten filename so it fits in the input text field
window['outfolder_short'].update(value = limitFilenameLen(values['outfolder']))
updateInitialFolders()

if event.startswith('opt_') or event in pathOptions:
writeConfig()

if event == 'start_ocr':
log.info('OCR queues started')
log.info('OCR queue started')
fileList=[]

if values['filename'] != '':
Expand All @@ -773,13 +791,9 @@ def nextJob(previousJob=None):
for file in glob.glob(values['infolder'] + '/*.PDF'):
fileList.append(file)

if values['opt_runsplitter'] == 'yes':
fillQueue = splitJobs
else:
fillQueue = ocrJobs

for file in fileList:
fillQueue.put(file)
ocrJobs.put(file)


#copy values into temporary object to prevent user from changing
#options of already running jobs
Expand All @@ -788,4 +802,4 @@ def nextJob(previousJob=None):
toggleButtons()

# get the first job
Job = nextJob()
Job = nextJob()
33 changes: 28 additions & 5 deletions code/splitter.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/usr/bin/env python3
#Version 0.6
#Version 0.7

import logging
import argparse
Expand Down Expand Up @@ -93,8 +93,22 @@ def analyzePage(PDF, pageNumber, separator='NEXT', mode='QR', cropfactor=1):

return (pageNumber,separatorCode)

def savePDFTextFile(PDFfile):
    '''Save the text of a PDF file to a text file named "<PDFfile>.txt".

    The PDF is opened in binary mode and handed to pdftotext.PDF. Every page
    obtained by iterating the result is written to the text file, followed by
    a newline.

    This is a best-effort export: any exception is logged as critical and
    swallowed, so a failed text export does not abort the caller.

    PDFfile -- path of the PDF to read; '.txt' is appended for the output path.
    '''
    logging.debug('Saving text %s.txt file' % PDFfile)
    try:
        with open(PDFfile, "rb") as fp:
            pdfAsText = pdftotext.PDF(fp)
            # The source stays open while pages are written, so this works
            # whether or not pdftotext reads the pages lazily.
            with open(PDFfile+'.txt', 'w') as f:
                for page in pdfAsText:
                    f.write('%s\n' % page)
    except Exception as error:
        # '%s', not '%e': '%e' is a float conversion and raises TypeError for
        # an exception object, which made this handler crash instead of logging.
        logging.critical('Saving text file %s failed. %s' % (PDFfile, error))
        return
    logging.debug('Text file saved')


def splitPDF(filename:str, outpath:str, separator='NEXT', mode='QR', stickerMode=False, dropName=False, workers=0, skipRewrite=False, cropfactor=1):
def splitPDF(filename:str, outpath:str, separator='NEXT', mode='QR', stickerMode=False, dropName=False, workers=0, skipRewrite=False, cropfactor=1, extractText=False):
startSplitTime = time.time()
if not skipRewrite:
logging.debug('Rewriting PDF %s to temporary file.' % filename)
Expand Down Expand Up @@ -226,8 +240,11 @@ def splitPDF(filename:str, outpath:str, separator='NEXT', mode='QR', stickerMode
splitPDF.close()
except:
logging.critical('Saving split PDF %s failed.' % saveAs)
#sys.exit("Unable to write split file to output folder.")
continue

if extractText==True:
savePDFTextFile (saveAs)

#Separator pages are dropped
else:
logging.debug('Assembling PDFs in "Separator Page Mode"')
Expand Down Expand Up @@ -263,7 +280,11 @@ def splitPDF(filename:str, outpath:str, separator='NEXT', mode='QR', stickerMode
splitPDF.close()
except:
logging.critical('Saving PDF %s failed.' % saveAs)
#sys.exit("Unable to write split file to output folder.")
continue

if extractText==True:
savePDFTextFile (saveAs)

else:
logging.debug('Segment %s has no pages. Separator on first page, last page or on consecutive pages?'% (str(filenamePostfix)))

Expand Down Expand Up @@ -321,6 +342,8 @@ def splitPDF(filename:str, outpath:str, separator='NEXT', mode='QR', stickerMode
help='Select used separator: QR (default), BARCODE, KEYWORD')
parser.add_argument('-af', '--area-factor', type=float, choices=[(1 * x / 4 ) for x in range(1, 5)], default=1.0,
help='Speed up QR/Barcode search by limiting search area. Origin is top left corner. Default is 1.0 (whole page). E.g. 0.5 is upper left quadrant.')
parser.add_argument('-t', '--extract-text', action='store_true',
help='Save text in separate text file')
parser.add_argument('-o', '--output-folder', metavar='/path/to/output/folder', type=str,
help='Where to save the split files? Default: Same as input folder')
parser.add_argument('--log', default="WARNING", choices=['WARNING', 'INFO', 'DEBUG'],
Expand All @@ -335,5 +358,5 @@ def splitPDF(filename:str, outpath:str, separator='NEXT', mode='QR', stickerMode
else:
raise ValueError('Invalid log level: %s' % loglevel)

for file in splitPDF (args.filename, args.output_folder, args.separator, args.mode, args.sticker_mode, args.drop_filename, args.workers, args.skip_rewrite, args.area_factor):
for file in splitPDF (args.filename, args.output_folder, args.separator, args.mode, args.sticker_mode, args.drop_filename, args.workers, args.skip_rewrite, args.area_factor, args.extract_text):
print(file)

0 comments on commit 6f85195

Please sign in to comment.