import datetime
import re

from appscript import *
|
---|
| 3 |
|
---|
# Table of recognised date layouts as (strptime format, regex) pairs; the
# trailing comment names the document source each layout was added for.
# Order is significant twice over: the regex alternation built below tries
# the entries first-to-last, and extract_date() maps a match back to its
# strptime format by position in this tuple.
DATE_FORMATS = (
    ('%m/%d/%y', r'\d{1,2}/\d{1,2}/\d{1,2}'),             # T-Mobile
    ('%m.%d.%y', r'\d{1,2}\.\d{1,2}\.\d{1,2}'),           # iFixit
    ('%b %d, %Y', r'[A-Z][a-z][a-z] \d{1,2}, \d{4}'),     # AmerenIP
    ('%B %d, %Y', r'[A-Z][a-z]+ ?\d{1,2}, ?\d{4}'),       # Amazon
    ('of %Y%m%d', r'of \d{8}'),                           # Amazon
    ('%m/%d/%Y', r'\d{1,2}/\d{1,2}/\d{4}'),               # Busey
    ('%b %d %Y', r'[A-Z]{3} \d{1,2} \d{4}'),              # State Farm
    ('%d %b %Y', r'\d{1,2} [A-Z][A-Za-z]{2} \d{4}'),      # Apple
    ('%Y-%m-%d', r'\d{4}-\d{2}-\d{2}'),                   # MacSpeech
    ('%Y-%m', r'\d{4}-\d{2}'),                            # filename
    ('%m1%d/%y', r'\d{1,2}1\d{1,2}/\d{1,2}'),             # T-Mo bad OCR
    ('%m/%d1%y', r'\d{1,2}/\d{1,2}1\d{1,2}'),             # T-Mo bad OCR
    ('%m/%d/%y', r'\d{1,2}/ \d{1,2}/ \d{1,2}'),           # T-Mo bad OCR
    ('%m/%d/%Y',
     r'(?:\d ?){1,2}/ (?:\d ?){1,2}/ (?:\d ?){4}'),       # Busey bad OCR
)

# One alternation over every regex above.  Each alternative is anchored on
# word boundaries and wrapped in exactly one capturing group (the table only
# uses non-capturing groups internally), so a match's ``lastindex`` is the
# 1-based position of the DATE_FORMATS entry that produced it.
RE_DATE = re.compile('|'.join(
    r'(\b' + pattern + r'\b)' for _strptime_format, pattern in DATE_FORMATS))
|
---|
| 23 |
|
---|
def extract_date(contents, match=None):
    """Find the first parseable date in *contents*.

    Every RE_DATE hit is parsed with the strptime format paired with the
    alternative that matched.  Without *match*, the first hit that parses
    is returned; with *match* (any object exposing ``year`` and ``month``)
    only a date falling in that same year and month is accepted.

    Returns a ``(date, no_match)`` tuple.  ``date`` is a ``datetime.date``,
    or None when no candidate qualified.  ``no_match`` lists the candidates
    skipped before returning: ``(text, format, error)`` tuples for hits that
    strptime rejected, and the raw matched string for dates that parsed but
    fell outside the hinted month.
    """
    no_match = []
    for m in RE_DATE.finditer(contents):
        # RE_DATE has one capturing group per DATE_FORMATS entry, so the
        # group number is the 1-based index of the format that matched.
        group = m.lastindex
        fmt = DATE_FORMATS[group - 1][0]
        raw = m.group(group)
        # Bad-OCR patterns allow stray spaces inside the date, so all spaces
        # are removed before parsing.  They are removed from the format as
        # well: most Python versions require at least one whitespace
        # character in the input wherever the format contains whitespace,
        # and a space-free format parses the squeezed text on every version.
        squeezed = raw.replace(' ', '')
        try:
            parsed = datetime.datetime.strptime(squeezed, fmt.replace(' ', ''))
        except ValueError as e:  # looked like a date but is not one
            no_match.append((squeezed, fmt, e))
            continue
        if not match or (match.year, match.month) == (parsed.year, parsed.month):
            return parsed.date(), no_match
        # Valid date, wrong month: remember it and keep looking.
        no_match.append(raw)
    return None, no_match
|
---|
| 40 |
|
---|
# Script body: set the creation date of every PDF record in the
# 'Paper.eflibrary' EagleFiler document to a date found in the record.
EagleFiler = app(id='com.c-command.EagleFiler')
Paper = EagleFiler.documents['Paper.eflibrary']

for record in Paper.library_records[its.kind=='PDF']():
    title = record.title()
    # A date in the title acts as a hint: extract_date() then only accepts
    # a date from the contents that lies in the same year and month.  The
    # title's rejected candidates are not reported.
    hint, _ = extract_date(title)

    extracted, no_match = extract_date(record.contents(), hint)

    if not extracted:
        # Report the record and every candidate that was rejected.  The
        # single-argument print(...) form emits the same text on Python 2
        # and Python 3; nm may itself be a tuple, hence the (nm,) wrapper.
        print('%s %s' % (title, hint))
        for nm in no_match:
            print(' no match %s' % (nm,))
        # NOTE(review): the source's indentation was lost; this check is
        # assumed to sit inside the "not extracted" branch (skip only when
        # there is neither an extracted date nor a hint) -- confirm.
        if not hint:
            continue

    # Prefer the date from the contents, fall back to the title's date.
    record.creation_date.set(extracted or hint)
|
---|