source: trunk/ICeCoffEE/ICeCoffEE/ICeCoffEEParser.m @ 473

Last change on this file since 473 was 473, checked in by Nicholas Riley, 12 years ago

Fix asymmetric parsing of single words wrapped in ambiguous delimiters. Strip trailing exclamation points, as is already done for commas, semicolons and periods.

File size: 10.3 KB
Line 
1//
2//  ICeCoffEEParser.m
3//  ICeCoffEE
4//
5//  Created by Nicholas Riley on 6/21/07.
6//  Copyright 2007 Nicholas Riley. All rights reserved.
7//
8
9#import "ICeCoffEEParser.h"
10#import "ICeCoffEE.h"
11
// Returns, through 'leftPtr' and 'rightPtr', the character sets that end a URL on its
// left and right side.  Both sets are built on the first call and cached in statics
// that are handed out again on every later call.
void ICCF_Delimiters(NSCharacterSet **leftPtr, NSCharacterSet **rightPtr) {
    static NSCharacterSet *cachedLeft = nil, *cachedRight = nil;

    if (cachedLeft == nil || cachedRight == nil) {
        // drop whatever half-initialized state may exist before rebuilding both sets
        [cachedLeft release];
        [cachedRight release];

        // characters that delimit a URL on either side
        NSMutableCharacterSet *common =
            [[[NSCharacterSet whitespaceAndNewlineCharacterSet] mutableCopy] autorelease];
        NSCharacterSet *printableASCII = [NSCharacterSet characterSetWithRange: NSMakeRange(0x21, 0x5e)];
        [common formUnionWithCharacterSet: [printableASCII invertedSet]]; // nonprintable and non-ASCII characters
        [common formUnionWithCharacterSet: [NSCharacterSet punctuationCharacterSet]];
        // XXX obsoleted by RFC 3986 now... use §2.1, 2.2, 2.3
        // RFC 2396 §2.2, 2.3, 2.4, plus % and # from "delims" set and {}, []
        [common removeCharactersInString: @";/?:@&=+$,-_.!~*'(){}[]%#"];

        // opening brackets only delimit on the left
        NSMutableCharacterSet *left = [[common mutableCopy] autorelease];
        [left addCharactersInString: @"<(["];
        cachedLeft = [left copy]; // make immutable again - for efficiency

        // closing brackets only delimit on the right
        NSMutableCharacterSet *right = [[common mutableCopy] autorelease];
        [right addCharactersInString: @">)]"];
        cachedRight = [right copy]; // make immutable again - for efficiency
    }

    *leftPtr = cachedLeft;
    *rightPtr = cachedRight;
}
38
// Internet Config instance shared by the functions in this file; NULL while none is running.
static ICInstance ICCF_icInst = NULL;

// Starts Internet Config and stores the new instance in ICCF_icInst.  If an instance is
// already stored, this is logged and the old instance is stopped first.  Asserts if
// ICStart does not return noErr.
void ICCF_StartIC() {
    if (ICCF_icInst != NULL) {
        ICLog(@"ICCF_StartIC: Internet Config is already running!");
        ICCF_StopIC();
    }

    OSStatus err = ICStart(&ICCF_icInst, kICCFCreator);
    NSCAssert1(err == noErr, ICCF_LocalizedString(@"Unable to start Internet Config (error %d)"), err);
}
51
// Stops the Internet Config instance held in ICCF_icInst and resets it to NULL.
// Only logs a message when no instance is running.
void ICCF_StopIC() {
    if (ICCF_icInst != NULL) {
        ICStop(ICCF_icInst);
        ICCF_icInst = NULL;
        return;
    }

    ICLog(@"ICCF_StopIC: Internet Config is not running!");
}
60
// Returns the running Internet Config instance; asserts if ICCF_icInst is still NULL.
ICInstance ICCF_GetInst() {
    ICInstance inst = ICCF_icInst;
    NSCAssert(inst != NULL, @"Internal error: Called ICCF_GetInst without ICCF_StartIC");
    return inst;
}
65
// Narrows a candidate URL down to the part that ICParseURL recognizes.
//
// string - text of the candidate URL; must be exactly range->length characters long.
// range  - input/output: on entry the range of the source document which contains
//          'string'; on return the range of the URL that was found, still expressed in
//          source document coordinates.
//
// One trailing ";", ",", "." or "!" is dropped, leading non-alphanumeric characters are
// skipped (together with the matching closing paren/brace/bracket, if the text ends with
// one), and what remains is handed to ICParseURL.  Asserts with "No URL is selected" when
// nothing is left after trimming or the text is not pure ASCII; ICCF_OSErrCAssert is
// given the ICParseURL result.
void ICCF_ParseURL(NSString *string, NSRange *range) {
    OSStatus err;
    Handle h;
    long selStart = 0, selEnd = range->length; // local offsets within the untrimmed 'string'
    long urlStart, urlEnd, urlLength;          // offsets within the trimmed text given to ICParseURL
    char *urlData = NULL;

    NSCAssert(selEnd == [string length], @"Internal error: URL string is wrong length");

    @try {
        // an empty string has no last character to inspect
        NSCAssert(selEnd > 0, @"No URL is selected");

        if ([[NSCharacterSet characterSetWithCharactersInString: @";,.!"] characterIsMember:
            [string characterAtIndex: selEnd - 1]]) {
            selEnd--;
        }
        // the string may have consisted of nothing but the stripped character
        NSCAssert(selStart < selEnd, @"No URL is selected");

        NSCharacterSet *alphanumericCharacterSet = [NSCharacterSet alphanumericCharacterSet];
        unichar opening, closing;
        while (![alphanumericCharacterSet characterIsMember:
                 (opening = [string characterAtIndex: selStart])]) {
            closing = [string characterAtIndex: selEnd - 1];
            // drop a closing bracket only along with its matching opening bracket
            if ((opening == '(' && closing == ')') ||
                (opening == '{' && closing == '}') ||
                (opening == '[' && closing == ']')) {
                selEnd--;
            }
            selStart++;
            NSCAssert(selStart < selEnd, @"No URL is selected");
        }

        urlLength = selEnd - selStart;
        string = [string substringWithRange: NSMakeRange(selStart, urlLength)];

        ICLog(@"Parsing URL |%@|", string);

        NSCAssert([string canBeConvertedToEncoding: NSASCIIStringEncoding], @"No URL is selected");

        urlData = (char *)malloc( (urlLength + 1) * sizeof(char));
        NSCAssert(urlData != NULL, @"Internal error: can't allocate memory for URL string");

        // XXX getCString: is deprecated in 10.4, but this is safe and shouldn't assert because we've already verified the string can be converted to ASCII, which should be a subset of any possible system encoding.  The replacement (getCString:maxLength:encoding:) is not available until 10.4, so we leave this until we dump Internet Config and gain IDN friendliness.
        [string getCString: urlData];

        h = NewHandle(0);
        NSCAssert(h != NULL, @"Internal error: can't allocate URL handle");

        // urlData holds only the trimmed text, so both the length and the selection passed
        // to ICParseURL must be expressed relative to it, not to the untrimmed string
        urlStart = 0;
        urlEnd = urlLength;
        err = ICParseURL(ICCF_GetInst(), "\pmailto", urlData, urlLength, &urlStart, &urlEnd, h);
        DisposeHandle(h);

        ICCF_OSErrCAssert(err, @"ICParseURL");

        // map the result back to document coordinates: trim offset plus offset in urlData
        range->length = urlEnd - urlStart;
        range->location += selStart + urlStart;
    } @finally {
        free(urlData);
    }
}
120
// Returns YES if 'character' occurs in 's' within 'range'.
static BOOL ICCF_StringIncludesCharacter(NSString *s, unichar character, NSRange range) {
    NSString *needle = [NSString stringWithCharacters: &character length: 1];
    NSCharacterSet *needleSet = [NSCharacterSet characterSetWithCharactersInString: needle];
    NSUInteger foundAt = [s rangeOfCharacterFromSet: needleSet
                                            options: NSLiteralSearch
                                              range: range].location;
    if (foundAt == NSNotFound)
        return NO;
    return YES;
}
127
// Returns YES if the part of 's' inside 'range' contains any of ':', '/', '.' or '@'.
static BOOL ICCF_IsLikelyURI(NSString *s, NSRange range) {
    NSCharacterSet *uriMarkers = [NSCharacterSet characterSetWithCharactersInString: @":/.@"];
    NSRange marker = [s rangeOfCharacterFromSet: uriMarkers options: NSLiteralSearch range: range];
    if (marker.location == NSNotFound)
        return NO;
    return YES;
}
132
// Returns YES if the part of 's' inside 'range' consists only of hexadecimal digits and
// colons (an empty range therefore also yields YES).
static BOOL ICCF_IsLikelyIPv6Address(NSString *s, NSRange range) {
    NSCharacterSet *hexAndColon = [NSCharacterSet characterSetWithCharactersInString: @"ABCDEFabcdef0123456789:"];
    NSRange offender = [s rangeOfCharacterFromSet: [hexAndColon invertedSet]
                                          options: NSLiteralSearch
                                            range: range];
    if (offender.location != NSNotFound)
        return NO;
    return YES;
}
138
// Finds the URL surrounding 'range' in 's'.
//
// s     - the text being searched.
// range - the selection (possibly empty) from which the search starts.
//
// The range is grown backwards to the nearest left delimiter and forwards to the nearest
// right delimiter (see ICCF_Delimiters), with extra handling for <...> pairs and for
// parens/braces/brackets that appear to belong to the URL.  The candidate is then passed
// to ICCF_ParseURL, and the range it reports is returned.
NSRange ICCF_URLEnclosingRange(NSString *s, NSRange range) {
    NSCharacterSet *urlLeftDelimiters = nil, *urlRightDelimiters = nil;
    NSRange delimiterRange;
    unsigned extraLen; // number of characters of 's' that follow 'range'
    BOOL multiLine = NO;

    ICCF_CheckRange(range);

    ICCF_Delimiters(&urlLeftDelimiters, &urlRightDelimiters);

    // right delimiter selected?  Yes, this can break with ...)URL(....  Oh well.
    // (the upper bound test keeps a range that starts at the very end of 's' from
    // indexing past the last character)
    if (range.location > 0 && range.location < [s length] &&
        [urlRightDelimiters characterIsMember: [s characterAtIndex: range.location]]) {
        --range.location;
        ++range.length;
        ICLog(@"expanding past initial %c, now |%@|", [s characterAtIndex: range.location + 1],
              [s substringWithRange: range]);
    }

expandFront:
    // XXX instead of 0, make this stop at the max URL length to prevent protracted searches
    // XXX backport to ICeCoffEETerminal
    // add 1 to range to trap delimiters that are on the edge of the selection (i.e., <...)
    delimiterRange = [s rangeOfCharacterFromSet: urlLeftDelimiters
                                        options: NSLiteralSearch | NSBackwardsSearch
                                          range: NSMakeRange(0, range.location + (range.location != [s length]))];
    if (delimiterRange.location == NSNotFound) {
        // extend to beginning of string
        range.length += range.location;
        range.location = 0;
    } else {
        NSCAssert(delimiterRange.length == 1, @"Internal error: delimiter matched range is not of length 1");
        if ([s characterAtIndex: delimiterRange.location] == '<') { // XXX move to expandBoth to handle clicking in middle
            // inside <...>: from here on only '>' ends the URL on the right
            multiLine = YES;
            urlRightDelimiters = [NSCharacterSet characterSetWithCharactersInString: @">"];
        }
        // start just after the delimiter
        range.length += range.location - delimiterRange.location - 1;
        range.location = delimiterRange.location + 1;
    }


expandBack:
    // XXX instead of length of string, make this stop at the max URL length to prevent protracted searches
    // add 1 to range to trap delimiters that are on the edge of the selection (i.e., ...>)
    extraLen = [s length] - range.location - range.length;
    delimiterRange = [s rangeOfCharacterFromSet: urlRightDelimiters
                                        options: NSLiteralSearch
                                          range: NSMakeRange(range.location + range.length - (range.length != 0),
                                                             extraLen + (range.length != 0))];
    if (delimiterRange.location == NSNotFound) {
        // extend to end of string
        range.length += extraLen;
        extraLen = 0;
    } else {
        NSCAssert(delimiterRange.length == 1, @"Internal error: delimiter matched range is not of length 1");
        // end just before the delimiter
        range.length += delimiterRange.location - range.location - range.length;
        extraLen = [s length] - NSMaxRange(range);

        unichar opening, closing = [s characterAtIndex: delimiterRange.location];
        if (closing == '>' && !multiLine && ICCF_StringIncludesCharacter(s, '<', NSMakeRange(0, range.location))) {
            // stopped at '>' with a '<' somewhere before us: redo the front search looking only for '<'
            urlLeftDelimiters = [NSCharacterSet characterSetWithCharactersInString: @"<"];
            goto expandFront; // XXX move to expandBoth to handle clicking in middle
        }
        // grow URL past closing paren/brace/bracket if we've seen an open paren/brace/bracket
        if (closing == ')') opening = '(';
        else if (closing == '}') opening = '{';
        else if (closing == ']') opening = '[';
        else goto expandBoth;
        if (!ICCF_StringIncludesCharacter(s, opening, range))
            goto expandBoth;

        if (extraLen == 1) {
            // the closing character is the last character of the string
            range.length += 1;
            --extraLen;
            ICLog(@"expanding past %c, now |%@|", closing, [s substringWithRange: range]);
        } else {
            range.length += 2;
            ICLog(@"expanding past %c, now |%@|", closing, [s substringWithRange: range]);
            goto expandBack;
        }
    }

expandBoth:
    if (range.location <= 1)
        goto checkRange; // nowhere to expand
    unichar opening = [s characterAtIndex: range.location - 1], closing;
    if (opening == '(') closing = ')';
    else if (opening == '{') closing = '}';
    else if (opening == '[') closing = ']';
    else goto checkRange;

    ICLog(@"extraLen = %d", extraLen);
    // check if we're inside a partial delimited URL: not foolproof, but handles (foo), {UUID} and [IPv6]
    if (delimiterRange.location != NSNotFound && [s characterAtIndex: delimiterRange.location] == closing &&
        ((opening == '[' && ICCF_IsLikelyIPv6Address(s, range)) || !ICCF_IsLikelyURI(s, range))) {
        ICLog(@"expanding past %c...%c, was |%@|", opening, closing, [s substringWithRange: range]);
        range.location -= 2;
        if (extraLen > 1)
            range.length += 4;
        else
            range.length += 2 + extraLen;
        ICLog(@"expanding past %c...%c, now |%@|", opening, closing, [s substringWithRange: range]);
        goto expandFront;
    }

    // The "]" that closes a bracketed address is looked up inside 'range' only: a "]"
    // located before range.location would make the unsigned length below wrap around.
    if (ICCF_StringIncludesCharacter(s, closing, range) &&
        ((opening == '[' &&
          ICCF_IsLikelyIPv6Address(s, NSMakeRange(range.location,
                                                  [s rangeOfString: @"]"
                                                           options: NSLiteralSearch
                                                             range: range].location - range.location)))
         || !ICCF_IsLikelyURI(s, range))) {
        range.location -= 2;
        range.length += 2;
        ICLog(@"expanding past %c, now |%@|", opening, [s substringWithRange: range]);
        goto expandFront;
    }

checkRange:
    ICCF_CheckRange(range);

    ICCF_ParseURL([s substringWithRange: range], &range);

    return range;
}
Note: See TracBrowser for help on using the repository browser.