source: trunk/Update Dates/update_dates.py@ 651

Last change on this file since 651 was 648, checked in by Nicholas Riley, 12 years ago

update_dates.py: Don't time out while waiting for a document to OCR.

Fix a longstanding issue where clicking "Cancel" raised an exception.

Also write plist on every scanned document (this change was made quite
a while ago but not checked in until now).

File size: 6.1 KB
Line 
1from appscript import *
2from datetime import datetime
3from osax import *
4from plistlib import readPlist, writePlist
5import os
6import re
7import time
8
# Preferences plist used by this script.
PREFERENCES_PATH = os.path.expanduser(
    '~/Library/Preferences/net.sabi.UpdateDates.plist')

# (strptime format, regex fragment) pairs, tried in the order listed.  The
# trailing comment on each row names the document source it was written for.
DATE_FORMATS = (
    ('%m/%d/%y', r'\d{1,2}/\d{1,2}/\d{1,2}'),           # T-Mobile
    ('%m.%d.%y', r'\d{1,2}\.\d{1,2}\.\d{1,2}'),         # iFixit
    ('%b %d, %Y', r'[A-Z][a-z][a-z] \d{1,2}, \d{4}'),   # AmerenIP
    ('%B %d, %Y', r'[A-Z][a-z]+ ?\d{1,2}, ?\d{4}'),     # Amazon
    ('of %Y%m%d', r'of \d{8}'),                         # Amazon
    ('%m/%d/%Y', r'\d{1,2}/\d{1,2}/\d{4}'),             # Busey
    ('%b %d %Y', r'[A-Z]{3} \d{1,2} \d{4}'),            # State Farm
    ('%d %b %Y', r'\d{1,2} [A-Z][A-Za-z]{2} \d{4}'),    # Apple
    ('%Y-%m-%d', r'\d{4}-\d{2}-\d{2}'),                 # MacSpeech
    ('%Y-%m', r'\d{4}-\d{2}'),                          # filename
    ('%m1%d/%y', r'\d{1,2}1\d{1,2}/\d{1,2}'),           # T-Mo bad OCR
    ('%m/%d1%y', r'\d{1,2}/\d{1,2}1\d{1,2}'),           # T-Mo bad OCR
    ('%m/%d/%y', r'\d{1,2}/ \d{1,2}/ \d{1,2}'),         # T-Mo bad OCR
    ('%m/%d/%Y',
     r'(?:\d ?){1,2}/ (?:\d ?){1,2}/ (?:\d ?){4}'),     # Busey bad OCR
)

# One alternation with exactly one capturing group per DATE_FORMATS row (the
# fragments themselves only use non-capturing groups), each wrapped in word
# boundaries, so the number of the group that matched identifies the row.
RE_DATE = re.compile(
    '|'.join(r'(\b' + fragment + r'\b)' for _, fragment in DATE_FORMATS))
31
def extract_date(contents, match=None):
    """Return the first acceptable date found in *contents*.

    Each RE_DATE hit is parsed with the strptime format of the DATE_FORMATS
    row whose group matched.  When *match* is given, a parsed date is only
    accepted if it has the same year and month as *match*.

    Returns a ``(date, no_match)`` pair.  ``date`` is a ``datetime.date``, or
    None if no candidate was accepted.  ``no_match`` lists the candidates
    rejected before that: ``(text, format, error)`` tuples for text strptime
    could not parse, and the raw matched text for dates in the wrong month.
    """
    no_match = []
    for m in RE_DATE.finditer(contents):
        group_number = m.lastindex
        table_format = DATE_FORMATS[group_number - 1][0]
        # Spaces are removed from the matched text (the bad-OCR patterns
        # allow stray spaces inside a date), so they must be removed from
        # the format too: current CPython strptime requires at least one
        # whitespace character wherever the format contains a space, and
        # would otherwise reject every date whose format has spaces.
        matched = m.group(group_number).replace(' ', '')
        try:
            parsed = datetime.strptime(matched, table_format.replace(' ', ''))
        except ValueError as e:  # not a date
            no_match.append((matched, table_format, e))
            continue
        if not match or (match.year, match.month) == (parsed.year,
                                                      parsed.month):
            return parsed.date(), no_match
        no_match.append(m.group(group_number))
    return None, no_match
48
def extract_source(title, hint):
    """Return the source-name part of *title*.

    When *hint* is falsy the title is returned unchanged.  Otherwise the
    title is cut at the start of its first RE_DATE match and trailing
    whitespace is removed from what remains.
    """
    if not hint:
        return title
    first_date = RE_DATE.search(title)
    prefix = title[:first_date.start(0)]
    return prefix.rstrip()
54
# Application reference for EagleFiler and its 'Paper.eflibrary' document.
EagleFiler = app(id='com.c-command.EagleFiler')
Paper = EagleFiler.documents['Paper.eflibrary']

# If that document does not exist in EagleFiler yet, ask EagleFiler to open
# the library from its location under the user's Documents folder.
if not Paper.exists():
    EagleFiler.open(
        os.path.expanduser('~/Documents/Paper/Paper.eflibrary'))
60
def read_sources():
    """Return the 'Sources' entry of the preferences plist.

    An empty list is returned when the plist has no 'Sources' key.  Nothing
    is caught here, so any error raised by readPlist reaches the caller.
    """
    preferences = readPlist(PREFERENCES_PATH)
    return preferences.get('Sources', [])
63
# Load the list of known sources; start with an empty list when there is no
# preferences file yet.
if os.path.exists(PREFERENCES_PATH):
    try:
        sources = read_sources()
    except Exception:
        # Narrowed from a bare ``except:`` so that KeyboardInterrupt and
        # SystemExit are no longer swallowed here.
        # Presumably the file is in a plist format readPlist cannot parse
        # (e.g. binary): have plutil rewrite it as XML in place, then retry.
        # A failure of the retry is deliberately left to propagate.
        from subprocess import call
        call(['plutil', '-convert', 'xml1', PREFERENCES_PATH])
        sources = read_sources()
else:
    sources = []
73
def update_all():
    """Re-date every PDF record in the Paper library and refresh ``sources``.

    For each PDF record, the title is searched for a date (the "hint") and
    the text before it is taken as the record's source name.  A non-empty
    source that also occurs in the record's contents (case-insensitively) is
    moved to the end of the module-level ``sources`` list.  The record's
    creation date is then set to the first date in its contents that falls in
    the hint's month (any date if there is no hint), or to the hint itself
    when the contents yield nothing.  Records with neither are left alone.

    Records whose contents yield no date are reported on standard output
    together with the rejected candidates.  Finally ``sources`` is reversed
    in place.
    """
    for record in Paper.library_records[its.kind == 'PDF']():
        title = record.title()
        hint, _ = extract_date(title)
        source = extract_source(title, hint)

        contents = record.contents()
        # Skip an empty source (a title that is nothing but a date): the
        # empty pattern matches any contents, and an empty entry in
        # ``sources`` would match at the start of every document later on.
        if source and re.search(re.escape(source), contents, re.IGNORECASE):
            if source in sources:
                sources.remove(source)
            sources.append(source)

        extracted, no_match = extract_date(contents, hint)

        if not extracted:
            # %-formatting gives the same output as the Python 2 print
            # statement while remaining valid syntax on Python 3.
            print('%s %s' % (title, hint))
            for nm in no_match:
                print(' no match %s' % (nm,))
            if not hint:
                continue

        record.creation_date.set(extracted or hint)

    sources.reverse()  # most recently referenced ones at top
98
def scan_one():
    """Scan one document with Acrobat, import it into EagleFiler and title it.

    Asks how many pages to scan, drives Acrobat's "Scan to PDF..." user
    interface through System Events for that many pages, then imports the
    saved PDF into the Paper library (deleting the scanned file afterwards).
    The new record is titled with the first known source found in its
    contents ('???' if none is found), followed by the year and month of the
    first date found in its contents; that date also becomes the record's
    creation date.

    Returns False when the page-count dialog yields no result (presumably
    the user chose Cancel), True once a document has been imported.
    """
    Acrobat = app(id='com.adobe.Acrobat.Pro')
    SystemEvents = app(id='com.apple.systemevents')
    acro_process = SystemEvents.application_processes[u'Acrobat']

    filename = datetime.now().strftime('Scanned Document %y%m%d %H%M%S')

    SA = ScriptingAddition()
    SA.activate()
    # Keep asking until a positive whole number of pages is entered.
    while True:
        result = SA.display_dialog('How many pages do you wish to scan?',
                                   buttons=['Cancel', 'Scan'],
                                   cancel_button=1, default_button=2,
                                   default_answer='1')
        if result is None:
            return False
        try:
            pages = int(result[k.text_returned])
        except ValueError:
            continue
        if pages > 0:
            break

    Acrobat.activate()

    document_menu = acro_process.menu_bars[1].menu_bar_items['Document']
    document_menu.menus[1].menu_items['Scan to PDF...'].click()
    acro_process.windows['Acrobat Scan'].buttons['Scan'].click()

    # pause (Carbon -> Cocoa? use keystrokes instead?)
    save_window = acro_process.windows['Save Scanned File As']
    save_window.text_fields[1].value.set(filename)
    save_window.buttons['Save'].click()

    acro_scan_window = acro_process.windows['Acrobat Scan']

    while True:
        acro_process.windows['DSmobile 600'].buttons['Scan'].click()
        # Poll until the 'Acrobat Scan' window exists again.
        while not acro_scan_window.exists():
            time.sleep(0.1)

        pages -= 1

        if pages == 0:
            # Last page: choose the second radio button and confirm.
            acro_scan_window.groups[1].radio_buttons[2].click()
            acro_scan_window.buttons['OK'].click()
            break

        # More pages to go: choose the first radio button and confirm.
        acro_scan_window.groups[1].radio_buttons[1].click()
        acro_scan_window.buttons['OK'].click()

    scanned_document = Acrobat.documents['%s.pdf' % filename]
    scanned_file = scanned_document.file_alias(timeout=0)
    scanned_document.close()

    record = Paper.import_(files=[scanned_file], deleting_afterwards=True)[0]
    contents = record.contents()

    title = '???'
    # Ignore empty entries.  An empty alternative -- or the empty pattern
    # built when no sources are known yet -- matches at offset 0 of any
    # text, which previously produced an empty title or a ValueError from
    # looking up '' in the list.
    known_sources = [s for s in sources if s]
    if known_sources:
        m = re.search('|'.join(map(re.escape, known_sources)), contents,
                      re.IGNORECASE)
        if m:
            found = m.group(0).lower()
            # use the saved source's case; fall back to the matched text if
            # lower-casing does not lead back to a saved entry
            title = next((s for s in known_sources if s.lower() == found),
                         m.group(0))

    extracted, _ = extract_date(contents)
    if extracted:
        title += extracted.strftime(' %Y-%m')
        record.creation_date.set(extracted)

    record.title.set(title)

    return True
172
173# update_all()
174
175# XXX incremental source recording from EagleFiler (use tag to record)
176
# Scan documents until scan_one() returns a false value, saving the sources
# list to the preferences plist after every successful scan.
while True:
    if not scan_one():
        break
    writePlist({'Sources': sources}, PREFERENCES_PATH)
Note: See TracBrowser for help on using the repository browser.