Spell checking in Objective-C



I've finally beat python at running the very simple spell checker described in an earlier post. The code is not "pure" Objective-C, some parts are just too slow so I've had to mix in some C.

The major difference compared to the versions described in the earlier post is this addition in the loop of - (NSSet *) knownEdits2:(NSString *) word of an additional condition:
if([ts intersectsSet:knownWords])
[edits2 unionSet:[self edits1:w]];

The intersectsSet condition avoids growing the set of edit distance 2 very effectively - in fact I suspect the python reference code does exactly that - I'm not sure how to read the python construct into distinct loops: return set(e2 for e1 in edits1(word) for e2 in edits1(e1) if e2 in NWORDS) so I can't tell for sure.


CSSpell .h

#import <Cocoa/Cocoa.h>

@interface CSSpell : NSObject
{
NSMutableDictionary * model;
NSDictionary * NWORDS;
NSMutableSet * knownWords;
}
@end


CSSpell.m


#import "CSSpell.h"

@interface CSMutableNumber:NSObject
{
NSUInteger mNumber;
}
@end

@implementation CSMutableNumber
- (id)initWithInt:(int)value
{
self = [super init];
if(!self) return nil;
mNumber = value;
return self;
}
+ (CSMutableNumber *)numberWithInt:(int)value
{
return [[[CSMutableNumber alloc] initWithInt:value] autorelease];
}
- (void)setIntValue:(int)value
{
mNumber = value;
}
- (void) increment
{
if(mNumber < NSUIntegerMax) mNumber++;
}
- (int) intValue
{
return mNumber;
}
@end

@implementation CSSpell

- (NSDictionary *) trainv2:(NSString *)text
{
if(model == nil)
{
model = [NSMutableDictionary dictionaryWithCapacity:40000];
[model retain];
}

NSString * t = [text lowercaseString];
char * buf = (char *)[t cStringUsingEncoding:NSASCIIStringEncoding];

char **bp = &buf;
char *tok;
while (tok = strsep(bp, " \r\t,.*()[]#\n\"-!?&/'~;:"))
if(strlen(tok)>2)
{
NSString * w = [NSString stringWithCString:tok];

CSMutableNumber * wordVal = [model objectForKey:w];
if(wordVal)
[wordVal increment];
else
[model setObject:[CSMutableNumber numberWithInt:1] forKey:w];

}

NSLog(@"Caching set of known words");

knownWords = [NSSet setWithArray:[model allKeys]];
return model;
}



/* returns a set of words at edit distance 1 away */
- (NSSet *) edits1:(NSString *) word
{
NSMutableSet * s = [NSMutableSet setWithCapacity:2000];

int i=0, len;
char c;
#define MAX_LEN_C 500
char str[MAX_LEN_C];
char temp[MAX_LEN_C];


strcpy(str, [word cStringUsingEncoding:NSASCIIStringEncoding]);
for(i=0;i<[word length];i++)
{
strcpy(temp, str); temp[i] = str[i+1]; temp[i+1] = str[i];
len = strlen(temp);

if(i<[word length]-1)
{
NSString * transposed = [NSString stringWithCString:temp length:len];
[s addObject:transposed];
}

strcpy(temp + i, str+i+1);
NSString * deletion = [NSString stringWithCString:temp length:strlen(temp)];
[s addObject:deletion];

for(c='a';c<='z';c++)
{
temp[i] = c; strcpy(temp + i + 1, str+i+1);
NSString * replaced = [NSString stringWithCString:temp length:len];
[s addObject:replaced];

strcpy(temp + i + 1, str+i);
NSString * inserted = [NSString stringWithCString:temp length:len+1];
[s addObject:inserted];
}
}

return s;
}



/* returns a set of known words based at edit distance 2 away */
- (NSSet *) knownEdits2:(NSString *) word
{
NSMutableSet * edits2 = [NSMutableSet setWithCapacity:5000];
NSSet * edits1 = [self edits1:word];

NSEnumerator * en1 = [edits1 objectEnumerator];
NSString * w;
while(w=[en1 nextObject])
{
NSSet * ts = [self edits1:w];


if([ts intersectsSet:knownWords])
[edits2 unionSet:[self edits1:w]];
}

//NSLog(@"Edits2 has %d", [edits2 count]);

[edits2 intersectSet:knownWords];
NSLog(@"KW for %@: %@", word, edits2);

return edits2;
}

/* returns a set of known words from a given set of words */
- (NSSet *) known:(NSSet *) words
{
NSMutableSet * s = [NSMutableSet setWithCapacity:100];
[s unionSet:words];
[s intersectSet:knownWords];
return s;
}

- (NSString *) correct:(NSString *) word
{
if([word length] > 300)
return nil;


NSMutableSet * candidates = [NSMutableSet setWithCapacity:10];

if([candidates count] < 1)
{
[candidates unionSet:[self known:[NSSet setWithObject:word]]];
//NSLog(@"Adding known words set: %@", candidates);
}

if([candidates count] < 1)
{
[candidates unionSet:[self known:[self edits1:word]]];
//NSLog(@"Adding edit distance 1 set: %@", candidates);
}

if([candidates count] < 1)
{
[candidates unionSet:[self knownEdits2:word]];
}
if([candidates count] < 1)
{
//NSLog(@"Adding the word itself");
[candidates addObject:word];
}

int max=0;
NSString * chosen = @"Unknown#";
NSEnumerator * e = [candidates objectEnumerator];
NSString * w;
while(w=[e nextObject])
{
CSMutableNumber * wordVal = [NWORDS objectForKey:w];
if(wordVal)
{
if([wordVal intValue] > max)
{
chosen = w;
max = [wordVal intValue];
}
}
}


return chosen;
}

- (void) awakeFromNib
{

NSString * cnt = [NSString stringWithContentsOfFile:@"/Users/diciu/Desktop/big.txt" encoding:NSASCIIStringEncoding error:nil];
NSLog(@"Dictionary has been loaded");

NWORDS = [self trainv2:cnt];
NSLog(@"Training done, %d words", [NWORDS count]);

NSDate * startDate = [NSDate date];

NSArray * tests = [NSArray arrayWithObjects:@"generataed", @"acount", @"guidlines",
@"wheere", @"myne", @"graet", @"silenc", @"aggresive",
@"appreceiated", @"aquantance", @"beggining", nil];
NSEnumerator * en = [tests objectEnumerator];
NSString * c;
while(c=[en nextObject])
{
NSDate * sd = [NSDate date];
NSString * correct = [self correct:c];
NSLog(@"%@ / %@ \t\t\t in %f seconds", c, correct, [sd timeIntervalSinceNow]);
}


NSLog(@"Total runtime: %f", [startDate timeIntervalSinceNow]);

}
@end