source: trunk/Update Dates/update_dates.py@ 564

Last change on this file since 564 was 564, checked in by Nicholas Riley, 15 years ago

Update Dates: first pass at auto-naming script for scanned PDFs in EagleFiler

File size: 2.5 KB
Line 
import datetime
import re

from appscript import *

# Each entry pairs a strptime() format with a regular expression that finds
# candidate text for it.  Order matters: the regexes are joined into a single
# alternation (earlier entries are tried first at any given position), and
# extract_date() maps the number of the matching group back to this table.
# The trailing comment on each entry is the label the author attached to it
# (apparently the kind of document the pattern was written for).
DATE_FORMATS = (
    ('%m/%d/%y', r'\d{1,2}/\d{1,2}/\d{1,2}'),  # T-Mobile
    ('%m.%d.%y', r'\d{1,2}\.\d{1,2}\.\d{1,2}'),  # iFixit
    ('%b %d, %Y', r'[A-Z][a-z][a-z] \d{1,2}, \d{4}'),  # AmerenIP
    ('%B %d, %Y', r'[A-Z][a-z]+ ?\d{1,2}, ?\d{4}'),  # Amazon
    ('of %Y%m%d', r'of \d{8}'),  # Amazon
    ('%m/%d/%Y', r'\d{1,2}/\d{1,2}/\d{4}'),  # Busey
    ('%b %d %Y', r'[A-Z]{3} \d{1,2} \d{4}'),  # State Farm
    ('%d %b %Y', r'\d{1,2} [A-Z][A-Za-z]{2} \d{4}'),  # Apple
    ('%Y-%m-%d', r'\d{4}-\d{2}-\d{2}'),  # MacSpeech
    ('%Y-%m', r'\d{4}-\d{2}'),  # filename
    ('%m1%d/%y', r'\d{1,2}1\d{1,2}/\d{1,2}'),  # T-Mo bad OCR
    ('%m/%d1%y', r'\d{1,2}/\d{1,2}1\d{1,2}'),  # T-Mo bad OCR
    ('%m/%d/%y', r'\d{1,2}/ \d{1,2}/ \d{1,2}'),  # T-Mo bad OCR
    ('%m/%d/%Y',
     r'(?:\d ?){1,2}/ (?:\d ?){1,2}/ (?:\d ?){4}'),  # Busey bad OCR
)

# One capturing group per DATE_FORMATS entry, each anchored on word
# boundaries, so that group number N corresponds to DATE_FORMATS[N - 1].
RE_DATE = re.compile('|'.join(r'(\b%s\b)' % regex
                              for _fmt, regex in DATE_FORMATS))


def extract_date(contents, match=None):
    """Find the first parseable date in *contents*.

    Every RE_DATE hit is parsed with the strptime() format of the
    DATE_FORMATS entry whose pattern matched.

    Parameters:
        contents: text to search.
        match: optional object with ``year`` and ``month`` attributes.  When
            given (and truthy), a parsed date is accepted only if it falls in
            the same year and month.

    Returns:
        A ``(date, no_match)`` tuple.  ``date`` is a ``datetime.date`` for the
        first accepted candidate, or ``None`` if there was none.  ``no_match``
        lists the candidates rejected before that point: a
        ``(text, format, exception)`` tuple for text that strptime() refused,
        or the raw matched string for a valid date in the wrong year/month.
    """
    no_match = []
    for m in RE_DATE.finditer(contents):
        group_index = m.lastindex
        date_format = DATE_FORMATS[group_index - 1][0]
        # Spaces are dropped from the text so OCR-inserted blanks do not
        # matter.  They must be dropped from the format as well: strptime()
        # in current Pythons requires at least one whitespace character
        # wherever the format contains one, so a space-free string would
        # never parse against e.g. '%b %d, %Y'.
        matched = m.group(group_index).replace(' ', '')
        try:
            parsed = datetime.datetime.strptime(
                matched, date_format.replace(' ', ''))
        except ValueError as e:  # not a date
            no_match.append((matched, date_format, e))
            continue
        if not match or (match.year, match.month) == (parsed.year, parsed.month):
            return parsed.date(), no_match
        no_match.append(m.group(group_index))
    return None, no_match
40
# Script body: for every PDF record in the "Paper.eflibrary" EagleFiler
# document, derive a date from the record's title and contents and store it
# as the record's creation date.
EagleFiler = app(id='com.c-command.EagleFiler')
Paper = EagleFiler.documents['Paper.eflibrary']

# NOTE(review): the original indentation was lost; the nesting below (with
# ``if not hint: continue`` inside ``if not extracted``) is reconstructed from
# the ``extracted or hint`` fallback on the last line -- confirm against the
# repository copy.
for record in Paper.library_records[its.kind == 'PDF']():
    title = record.title()
    # A date found in the title is used as a year/month hint for the
    # contents search, and as the fallback value if that search fails.
    hint, no_match = extract_date(title)

    contents = record.contents()
    extracted, no_match = extract_date(contents, hint)

    if not extracted:
        # Report what was rejected.  The single-argument print() form emits
        # the same text under both Python 2 and Python 3.
        print('%s %s' % (title, hint))
        for nm in no_match:
            print(' no match %s' % (nm,))
        if not hint:
            # Neither the contents nor the title yielded a date.
            continue

    record.creation_date.set(extracted or hint)
Note: See TracBrowser for help on using the repository browser.