Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Normal
Revision Log

source: trunk/Update Dates/update_dates.py@ 564

Last change on this file since 564 was 564, checked in by Nicholas Riley, 15 years ago
Update Dates: first pass at auto-naming script for scanned PDFs in EagleFiler
File size: 2.5 KB

Rev	Line
[564]	1	from appscript import *
	2	import re
	3
	4	DATE_FORMATS = (('%m/%d/%y', r'\d{1,2}/\d{1,2}/\d{1,2}' ), # T-Mobile
	5	('%m.%d.%y', r'\d{1,2}\.\d{1,2}\.\d{1,2}' ), # iFixit
	6	('%b %d, %Y', r'[A-Z][a-z][a-z] \d{1,2}, \d{4}'), # AmerenIP
	7	('%B %d, %Y', r'[A-Z][a-z]+ ?\d{1,2}, ?\d{4}' ), # Amazon
	8	('of %Y%m%d', r'of \d{8}' ), # Amazon
	9	('%m/%d/%Y', r'\d{1,2}/\d{1,2}/\d{4}' ), # Busey
	10	('%b %d %Y', r'[A-Z]{3} \d{1,2} \d{4}' ), # State Farm
	11	('%d %b %Y', r'\d{1,2} [A-Z][A-Za-z]{2} \d{4}'), # Apple
	12	('%Y-%m-%d', r'\d{4}-\d{2}-\d{2}' ), # MacSpeech
	13	('%Y-%m', r'\d{4}-\d{2}' ), # filename
	14	('%m1%d/%y', r'\d{1,2}1\d{1,2}/\d{1,2}' ), # T-Mo bad OCR
	15	('%m/%d1%y', r'\d{1,2}/\d{1,2}1\d{1,2}' ), # T-Mo bad OCR
	16	('%m/%d/%y', r'\d{1,2}/ \d{1,2}/ \d{1,2}' ), # T-Mo bad OCR
	17	('%m/%d/%Y',
	18	r'(?:\d ?){1,2}/ (?:\d ?){1,2}/ (?:\d ?){4}' ), # Busey bad OCR
	19	)
	20
	21	RE_DATE = re.compile('\|'.join(r'(\b%s\b)' % regex
	22	for format, regex in DATE_FORMATS))
	23
	24	def extract_date(contents, match=None):
	25	no_match = []
	26	for m in RE_DATE.finditer(contents):
	27	matched_format = m.lastindex
	28	format = DATE_FORMATS[matched_format - 1][0]
	29	# note: spaces in strptime format match zero or more spaces, this is OK
	30	matched = m.group(matched_format).replace(' ', '')
	31	try:
	32	parsed = datetime.datetime.strptime(matched, format)
	33	except ValueError, e: # not a date
	34	no_match.append((matched, format, e))
	35	continue
	36	if not match or (match.year, match.month) == (parsed.year, parsed.month):
	37	return parsed.date(), no_match
	38	no_match.append(m.group(matched_format))
	39	return None, no_match
	40
	41	EagleFiler = app(id='com.c-command.EagleFiler')
	42	Paper = EagleFiler.documents['Paper.eflibrary']
	43
	44	for record in Paper.library_records[its.kind=='PDF']():
	45	title = record.title()
	46	hint, no_match = extract_date(title)
	47
	48	contents = record.contents()
	49	extracted, no_match = extract_date(contents, hint)
	50
	51	if not extracted:
	52	print title, hint
	53	for nm in no_match:
	54	print ' no match', nm
	55	if not hint:
	56	continue
	57
	58	record.creation_date.set(extracted or hint)

Note: See TracBrowser for help on using the repository browser.

Download in other formats: