Context Navigation

update_dates.py@ 655

Last change on this file since 655 was 648, checked in by Nicholas Riley, 12 years ago

update_dates.py: Don't time out while waiting for a document to OCR.

Fix a longstanding issue where clicking "Cancel" raised an exception.

Also write plist on every scanned document (this change was made quite
a while ago but not checked in until now).

File size: 6.1 KB

Rev	Line
[564]	1	from appscript import *
[566]	2	from datetime import datetime
	3	from osax import *
	4	from plistlib import readPlist, writePlist
	5	import os
[564]	6	import re
[566]	7	import time
[564]	8
# Property list file in which the list of known document sources is stored.
PREFERENCES_PATH = os.path.expanduser(
    '~/Library/Preferences/net.sabi.UpdateDates.plist')

# (strptime format, regular expression) pairs.  Order matters: the patterns
# are tried in this order at each position, and the position of a pair in
# this tuple is what maps a regex match back to its strptime format.  None
# of the patterns may contain a capturing group of its own.  The trailing
# comments name the document that motivated each pattern.
DATE_FORMATS = (
    ('%m/%d/%y', r'\d{1,2}/\d{1,2}/\d{1,2}'),         # T-Mobile
    ('%m.%d.%y', r'\d{1,2}\.\d{1,2}\.\d{1,2}'),       # iFixit
    ('%b %d, %Y', r'[A-Z][a-z][a-z] \d{1,2}, \d{4}'),  # AmerenIP
    ('%B %d, %Y', r'[A-Z][a-z]+ ?\d{1,2}, ?\d{4}'),    # Amazon
    ('of %Y%m%d', r'of \d{8}'),                        # Amazon
    ('%m/%d/%Y', r'\d{1,2}/\d{1,2}/\d{4}'),            # Busey
    ('%b %d %Y', r'[A-Z]{3} \d{1,2} \d{4}'),           # State Farm
    ('%d %b %Y', r'\d{1,2} [A-Z][A-Za-z]{2} \d{4}'),   # Apple
    ('%Y-%m-%d', r'\d{4}-\d{2}-\d{2}'),                # MacSpeech
    ('%Y-%m', r'\d{4}-\d{2}'),                         # filename
    ('%m1%d/%y', r'\d{1,2}1\d{1,2}/\d{1,2}'),          # T-Mo bad OCR
    ('%m/%d1%y', r'\d{1,2}/\d{1,2}1\d{1,2}'),          # T-Mo bad OCR
    ('%m/%d/%y', r'\d{1,2}/ \d{1,2}/ \d{1,2}'),        # T-Mo bad OCR
    ('%m/%d/%Y',
     r'(?:\d ?){1,2}/ (?:\d ?){1,2}/ (?:\d ?){4}'),    # Busey bad OCR
)

# A single alternation with exactly one capturing group per DATE_FORMATS
# entry, so that ``match.lastindex - 1`` is the index of the entry that
# matched.  The alternatives must be joined with a bare '|': an escaped
# '\|' is a literal pipe character in a regular expression, which would
# glue all patterns into one that never matches a real date.
RE_DATE = re.compile('|'.join(r'(\b%s\b)' % regex
                              for _date_format, regex in DATE_FORMATS))
	31
	32	def extract_date(contents, match=None):
	33	no_match = []
	34	for m in RE_DATE.finditer(contents):
	35	matched_format = m.lastindex
	36	format = DATE_FORMATS[matched_format - 1][0]
	37	# note: spaces in strptime format match zero or more spaces, this is OK
	38	matched = m.group(matched_format).replace(' ', '')
	39	try:
[566]	40	parsed = datetime.strptime(matched, format)
[564]	41	except ValueError, e: # not a date
	42	no_match.append((matched, format, e))
	43	continue
	44	if not match or (match.year, match.month) == (parsed.year, parsed.month):
	45	return parsed.date(), no_match
	46	no_match.append(m.group(matched_format))
	47	return None, no_match
	48
def extract_source(title, hint):
    """Return the source-name part of a document title.

    When ``hint`` is truthy, the title is cut at the start of its first
    RE_DATE match and trailing whitespace is stripped.  Otherwise the title
    is returned unchanged.
    """
    if not hint:
        return title
    date_start = RE_DATE.search(title).start(0)
    return title[:date_start].rstrip()
	54
# Scripting references to EagleFiler and to its "Paper" library document.
EagleFiler = app(id='com.c-command.EagleFiler')
Paper = EagleFiler.documents['Paper.eflibrary']

# Ask EagleFiler to open the library from disk when it is not open yet.
if not Paper.exists():
    library_path = os.path.expanduser('~/Documents/Paper/Paper.eflibrary')
    EagleFiler.open(library_path)
[564]	60
def read_sources():
    """Return the 'Sources' entry of the preferences plist ([] if absent)."""
    preferences = readPlist(PREFERENCES_PATH)
    return preferences.get('Sources', [])
	63
	64	if os.path.exists(PREFERENCES_PATH):
	65	try:
	66	sources = read_sources()
	67	except:
	68	from subprocess import call
	69	call(['plutil', '-convert', 'xml1', PREFERENCES_PATH])
	70	sources = read_sources()
	71	else:
	72	sources = []
	73
def update_all():
    """Re-date every PDF record in the Paper library and rebuild ``sources``.

    For each PDF record:

    * a date hint and a source name are derived from the record title;
    * if the source name occurs in the record contents (case-insensitive),
      it is moved to the end of ``sources``;
    * the creation date is set to the first date in the contents that falls
      in the hinted month, falling back to the hint itself.  Records with
      neither are reported and left untouched.

    After the loop ``sources`` is reversed in place so that the most
    recently referenced sources come first.
    """
    for record in Paper.library_records[its.kind == 'PDF']():
        title = record.title()
        hint, _ = extract_date(title)
        source = extract_source(title, hint)

        contents = record.contents()
        # An empty source (title starting with its date) would match any
        # contents and put '' into the list of known sources; skip it.
        if source and re.search(re.escape(source), contents, re.IGNORECASE):
            if source in sources:
                sources.remove(source)
            sources.append(source)

        extracted, no_match = extract_date(contents, hint)

        if not extracted:
            # Report what was tried.  Single-argument print() behaves the
            # same as a statement (Python 2) and as a function (Python 3).
            print('%s %s' % (title, hint))
            for nm in no_match:
                print(' no match %s' % (nm,))
            if not hint:
                continue

        record.creation_date.set(extracted or hint)

    sources.reverse()  # most recently referenced ones at top
	98
	99	def scan_one():
	100	Acrobat = app(id='com.adobe.Acrobat.Pro')
	101	SystemEvents = app(id='com.apple.systemevents')
	102	acro_process = SystemEvents.application_processes[u'Acrobat']
	103
[593]	104	filename = datetime.now().strftime('Scanned Document %y%m%d %H%M%S')
[566]	105
	106	SA = ScriptingAddition()
	107	SA.activate()
[648]	108	while True:
	109	result = SA.display_dialog('How many pages do you wish to scan?',
	110	buttons=['Cancel', 'Scan'],
	111	cancel_button=1, default_button=2,
	112	default_answer='1')
	113	if result is None:
	114	return False
	115	try:
	116	pages = int(result[k.text_returned])
	117	except ValueError:
	118	continue
	119	if pages > 0:
	120	break
[566]	121
	122	Acrobat.activate()
	123
	124	acro_process.menu_bars[1].menu_bar_items['Document'].menus[1].\
	125	menu_items['Scan to PDF...'].click()
	126	acro_process.windows['Acrobat Scan'].buttons['Scan'].click()
	127
	128	# pause (Carbon -> Cocoa? use keystrokes instead?)
	129	acro_process.windows['Save Scanned File As'].text_fields[1].value.\
	130	set(filename)
	131	acro_process.windows['Save Scanned File As'].buttons['Save'].click()
	132
	133	acro_scan_window = acro_process.windows['Acrobat Scan']
	134
	135	while True:
	136	acro_process.windows['DSmobile 600'].buttons['Scan'].click()
	137	while not acro_scan_window.exists():
	138	time.sleep(0.1)
	139
	140	pages -= 1
	141
	142	if pages == 0:
	143	acro_scan_window.groups[1].radio_buttons[2].click()
	144	acro_scan_window.buttons['OK'].click()
	145	break
	146
	147	acro_scan_window.groups[1].radio_buttons[1].click()
	148	acro_scan_window.buttons['OK'].click()
	149
	150	scanned_document = Acrobat.documents['%s.pdf' % filename]
[648]	151	scanned_file = scanned_document.file_alias(timeout=0)
[566]	152	scanned_document.close()
	153
[626]	154	record = Paper.import_(files=[scanned_file], deleting_afterwards=True)[0]
[564]	155	contents = record.contents()
[566]	156	m = re.search('(%s)' % '\|'.join(map(re.escape, sources)), contents,
	157	re.IGNORECASE)
	158	if m:
	159	# use the saved source's case
	160	title = sources[map(str.lower, sources).index(m.group(1).lower())]
	161	else:
	162	title = '???'
[564]	163
[566]	164	extracted, no_match = extract_date(contents)
	165	if extracted:
	166	title += extracted.strftime(' %Y-%m')
	167	record.creation_date.set(extracted)
[564]	168
[566]	169	record.title.set(title)
	170
	171	return True
	172
	173	# update_all()
	174
	175	# XXX incremental source recording from EagleFiler (use tag to record)
	176
	177	while scan_one():
[648]	178	writePlist({'Sources': sources}, PREFERENCES_PATH)

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: trunk/Update Dates/update_dates.py@ 655

Download in other formats: