/*
 * Copyright 2002-2011, Haiku, Inc. All rights reserved.
 * Copyright 2002 Alexander G. M. Smith.
 * Copyright 2011, Clemens Zeidler <haiku@clemens-zeidler.de>
 * Distributed under the terms of the MIT License.
 */
/******************************************************************************
 * $Id: SpamFilter.cpp 29284 2009-02-22 13:45:40Z bga $
 *
 * SpamFilter - Uses Bayesian statistics to evaluate the spaminess of a
 * message. The evaluation is done by a separate server, this add-on just gets
 * the text and uses scripting commands to get an evaluation from the server.
 * If the server isn't running, it will be found and started up. Once the
 * evaluation has been received, it is added to the message as an attribute and
 * optionally as an addition to the subject. Some other add-on later in the
 * pipeline will use the attribute to delete the message or move it to some
 * other folder.
 *
 * Public Domain 2002, by Alexander G. M. Smith, no warranty.
 *
 * $Log: SpamFilter.cpp,v $ (SVN doesn't support log messages so manually done)
 * r11769 | bonefish | 2005-03-17 03:30:54 -0500 (Thu, 17 Mar 2005) | 1 line
 * Move trunk into respective module.
 *
 * r9934 | nwhitehorn | 2004-11-11 21:55:05 -0500 (Thu, 11 Nov 2004) | 2 lines
 * Added AGMS's excellent spam detection software. Still some weirdness with
 * the configuration interface from E-mail prefs.
 *
 * r9669 | brunoga | 2004-10-30 18:23:26 -0400 (Sat, 30 Oct 2004) | 2 lines
 * AGMS Spam Filter.
 *
 * Revision 1.19 2004/09/20 15:57:30 nwhitehorn
 * Mostly updated the tree to Be/Haiku style identifier naming conventions. I have a few more things to work out, mostly in mail_util.h, and then I'm proceeding to jamify the build system. Then we go into Haiku CVS.
 *
 * Revision 1.18 2003/09/20 12:39:27 agmsmith
 * Memory leak delete needs [] bug.
 *
 * Revision 1.17 2003/07/08 21:12:47 agmsmith
 * Changed other spam filter defaults to values I find useful.
 *
 * Revision 1.16 2003/07/08 20:56:40 agmsmith
 * Turn on auto-training for the spam filter by default.
 *
 * Revision 1.15 2003/07/06 13:30:33 agmsmith
 * Make sure that the spam filter doesn't auto-train the message twice
 * when it gets a partially downloaded e-mail (will just train on the
 * partial one, ignore the complete message when it gets downloaded).
 *
 * Revision 1.14 2003/05/27 17:12:59 nwhitehorn
 * Massive refactoring of the Protocol/ChainRunner/Filter system. You can probably
 * examine its scope by examining the number of files changed. Regardless, this is
 * preparation for lots of new features, and REAL WORKING IMAP. Yes, you heard me.
 * Enjoy, and prepare for bugs (although I've fixed all the ones I've found, I
 * suspect there are some memory leaks in ChainRunner).
 *
 * Revision 1.13 2003/02/08 21:54:17 agmsmith
 * Updated the AGMSBayesianSpamServer documentation to match the current
 * version. Also removed the Beep options from the spam filter, now they
 * are turned on or off in the system sound preferences.
 *
 * Revision 1.12 2002/12/18 02:27:45 agmsmith
 * Added uncertain classification as suggested by BiPolar.
 *
 * Revision 1.11 2002/12/16 16:03:20 agmsmith
 * Changed spam cutoff to 0.95 to work with default Chi-Squared scoring.
 *
 * Revision 1.10 2002/12/13 22:04:42 agmsmith
 * Changed default to turn on the Spam marker in the subject.
 *
 * Revision 1.9 2002/12/13 20:27:44 agmsmith
 * Added auto-training mode to the filter. It evaluates a message for
 * spaminess then recursively adds it to the database. This can lead
 * to weird results unless the user corrects the bad classifications.
 *
 * Revision 1.8 2002/11/28 20:20:57 agmsmith
 * Now checks if the spam database is running in headers only mode, and
 * then only downloads headers if that is the case.
 *
 * Revision 1.7 2002/11/10 19:36:26 agmsmith
 * Retry launching server a few times, but not too many.
 *
 * Revision 1.6 2002/11/03 02:21:02 agmsmith
 * Never mind, just use the SourceForge version numbers. Ugh.
 *
 * Revision 1.8 2002/10/21 16:12:09 agmsmith
 * Added option for spam if no words found, use new method of saving
 * the attribute which avoids hacking the rest of the mail system.
 *
 * Revision 1.7 2002/10/11 20:01:28 agmsmith
 * Added sound effects (system beep) for genuine and spam, plus config option
 * for it.
 *
 * Revision 1.6 2002/10/01 00:45:34 agmsmith
 * Changed default spam ratio to 0.56 from 0.9, for use with
 * the Gary Robinson method in AGMSBayesianSpamServer 1.49.
 *
 * Revision 1.5 2002/09/25 13:23:21 agmsmith
 * Don't leave the data stream at the initial position, try leaving it
 * at the end. Was having mail progress bar problems.
 *
 * Revision 1.4 2002/09/23 19:14:13 agmsmith
 * Added an option to have the server quit when done.
 *
 * Revision 1.3 2002/09/23 03:33:34 agmsmith
 * First working version, with cutoff ratio and subject modification,
 * and an attribute added if a patch is made to the Folder filter.
 *
 * Revision 1.2 2002/09/21 20:57:22 agmsmith
 * Fixed bugs so now it compiles.
110 * 111 * Revision 1.1 2002/09/21 20:47:15 agmsmith 112 * Initial revision 113 */ 114 115#include <Beep.h> 116#include <Catalog.h> 117#include <fs_attr.h> 118#include <Messenger.h> 119#include <Node.h> 120#include <Path.h> 121#include <Roster.h> 122#include <String.h> 123#include <FindDirectory.h> 124#include <Entry.h> 125 126#include <stdlib.h> 127#include <stdio.h> 128 129#include "SpamFilter.h" 130 131 132#undef B_TRANSLATION_CONTEXT 133#define B_TRANSLATION_CONTEXT "SpamFilter" 134 135 136// The names match the ones set up by spamdbm for sound effects. 137static const char *kAGMSBayesBeepGenuineName = "SpamFilter-Genuine"; 138static const char *kAGMSBayesBeepSpamName = "SpamFilter-Spam"; 139static const char *kAGMSBayesBeepUncertainName = "SpamFilter-Uncertain"; 140 141static const char *kServerSignature = "application/x-vnd.agmsmith.spamdbm"; 142 143 144AGMSBayesianSpamFilter::AGMSBayesianSpamFilter(MailProtocol& protocol, 145 AddonSettings* addonSettings) 146 : 147 MailFilter(protocol, addonSettings), 148 149 fAddSpamToSubject(false), 150 fAutoTraining(true), 151 fGenuineCutoffRatio(0.01f), 152 fHeaderOnly(false), 153 fLaunchAttemptCount(0), 154 fNoWordsMeansSpam(true), 155 fQuitServerWhenFinished(false), 156 fSpamCutoffRatio(0.99f) 157{ 158 bool tempBool; 159 float tempFloat; 160 BMessenger tempMessenger; 161 162 const BMessage* settings = &addonSettings->Settings(); 163 if (settings != NULL) { 164 if (settings->FindBool ("AddMarkerToSubject", &tempBool) == B_OK) 165 fAddSpamToSubject = tempBool; 166 if (settings->FindBool ("AutoTraining", &tempBool) == B_OK) 167 fAutoTraining = tempBool; 168 if (settings->FindFloat ("GenuineCutoffRatio", &tempFloat) == B_OK) 169 fGenuineCutoffRatio = tempFloat; 170 if (settings->FindBool ("NoWordsMeansSpam", &tempBool) == B_OK) 171 fNoWordsMeansSpam = tempBool; 172 if (settings->FindBool ("QuitServerWhenFinished", &tempBool) == B_OK) 173 fQuitServerWhenFinished = tempBool; 174 if (settings->FindFloat ("SpamCutoffRatio", 
&tempFloat) == B_OK) 175 fSpamCutoffRatio = tempFloat; 176 } 177} 178 179 180AGMSBayesianSpamFilter::~AGMSBayesianSpamFilter () 181{ 182 if (fQuitServerWhenFinished && fMessengerToServer.IsValid ()) 183 fMessengerToServer.SendMessage(B_QUIT_REQUESTED); 184} 185 186 187void 188AGMSBayesianSpamFilter::HeaderFetched(const entry_ref& ref, BFile* file) 189{ 190 _CheckForSpam(file); 191} 192 193 194void 195AGMSBayesianSpamFilter::BodyFetched(const entry_ref& ref, BFile* file) 196{ 197 if (fHeaderOnly) 198 return; 199 200 // See if the message has already been classified. Happens for messages 201 // which are partially downloaded when you have auto-training on. Could 202 // untrain the partial part before training on the complete message, but we 203 // don't know how big it was, so instead just ignore the message. 204 attr_info attributeInfo; 205 if (file->GetAttrInfo ("MAIL:classification", &attributeInfo) == B_OK) 206 return; 207 208 _CheckForSpam(file); 209} 210 211 212status_t 213AGMSBayesianSpamFilter::_CheckForSpam(BFile* file) 214{ 215 // Get a connection to the spam database server. Launch if needed, should 216 // only need it once, unless another e-mail thread shuts down the server 217 // inbetween messages. This code used to be in InitCheck, but apparently 218 // that isn't called. 219 printf("Checking for Spam Server.\n"); 220 if (fLaunchAttemptCount == 0 || !fMessengerToServer.IsValid ()) { 221 if (_GetTokenizeMode() != B_OK) 222 return B_ERROR; 223 } 224 225 off_t dataSize; 226 file->GetSize(&dataSize); 227 char* stringBuffer = new char[dataSize + 1]; 228 file->Read(stringBuffer, dataSize); 229 stringBuffer[dataSize] = 0; // Add an end of string NUL, just in case. 230 231 float spamRatio; 232 if (_GetSpamRatio(stringBuffer, dataSize, spamRatio) != B_OK) 233 return B_ERROR; 234 235 // If we are auto-training, feed back the message to the server as a 236 // training example (don't train if it is uncertain). 
237 if (fAutoTraining && (spamRatio >= fSpamCutoffRatio 238 || spamRatio < fGenuineCutoffRatio)) { 239 _TrainServer(stringBuffer, dataSize, spamRatio); 240 } 241 242 delete[] stringBuffer; 243 244 // write attributes 245 const char *classificationString; 246 classificationString = (spamRatio >= fSpamCutoffRatio) ? "Spam" 247 : ((spamRatio < fGenuineCutoffRatio) ? "Genuine" : "Uncertain"); 248 file->WriteAttr("MAIL:classification", B_STRING_TYPE, 0 /* offset */, 249 classificationString, strlen(classificationString) + 1); 250 251 // Store the spam ratio in an attribute called MAIL:ratio_spam, 252 // attached to the eventual output file. 253 file->WriteAttr("MAIL:ratio_spam", B_FLOAT_TYPE, 0 /* offset */, &spamRatio, 254 sizeof(spamRatio)); 255 256 // Also add it to the subject, if requested. 257 if (fAddSpamToSubject && spamRatio >= fSpamCutoffRatio) 258 _AddSpamToSubject(file, spamRatio); 259 260 // Beep using different sounds for spam and genuine, as Jeremy Friesner 261 // nudged me to get around to implementing. And add uncertain to that, as 262 // "BiPolar" suggested. If the user doesn't want to hear the sound, they 263 // can turn it off in the system sound preferences. 264 265 if (spamRatio >= fSpamCutoffRatio) { 266 system_beep(kAGMSBayesBeepSpamName); 267 } else if (spamRatio < fGenuineCutoffRatio) { 268 system_beep(kAGMSBayesBeepGenuineName); 269 } else { 270 system_beep(kAGMSBayesBeepUncertainName); 271 } 272 273 return B_OK; 274} 275 276 277status_t 278AGMSBayesianSpamFilter::_CheckForSpamServer() 279{ 280 // Make sure the server is running. 
281 if (be_roster->IsRunning (kServerSignature)) 282 return B_OK; 283 284 status_t errorCode = be_roster->Launch (kServerSignature); 285 if (errorCode == B_OK) 286 return errorCode; 287 288 BPath path; 289 entry_ref ref; 290 directory_which places[] = {B_COMMON_BIN_DIRECTORY,B_BEOS_BIN_DIRECTORY}; 291 for (int32 i = 0; i < 2; i++) { 292 find_directory(places[i],&path); 293 path.Append("spamdbm"); 294 if (!BEntry(path.Path()).Exists()) 295 continue; 296 get_ref_for_path(path.Path(),&ref); 297 if ((errorCode = be_roster->Launch(&ref)) == B_OK) 298 break; 299 } 300 301 return errorCode; 302} 303 304 305status_t 306AGMSBayesianSpamFilter::_GetTokenizeMode() 307{ 308 if (fLaunchAttemptCount > 3) 309 return B_ERROR; // Don't try to start the server too many times. 310 fLaunchAttemptCount++; 311 312 // Make sure the server is running. 313 status_t errorCode = _CheckForSpamServer(); 314 if (errorCode != B_OK) 315 return errorCode; 316 317 // Set up the messenger to the database server. 318 fMessengerToServer = BMessenger(kServerSignature); 319 if (!fMessengerToServer.IsValid ()) 320 return B_ERROR; 321 322 // Check if the server is running in headers only mode. If so, we only 323 // need to download the header rather than the entire message. 
324 BMessage scriptingMessage(B_GET_PROPERTY); 325 scriptingMessage.AddSpecifier("TokenizeMode"); 326 BMessage replyMessage; 327 if ((errorCode = fMessengerToServer.SendMessage (&scriptingMessage, 328 &replyMessage)) != B_OK) 329 return errorCode; 330 status_t tempErrorCode; 331 if ((errorCode = replyMessage.FindInt32 ("error", &tempErrorCode)) 332 != B_OK) 333 return errorCode; 334 if ((errorCode = tempErrorCode) != B_OK) 335 return errorCode; 336 337 const char *tokenizeModeStringPntr; 338 if ((errorCode = replyMessage.FindString ("result", 339 &tokenizeModeStringPntr)) != B_OK) 340 return errorCode; 341 fHeaderOnly = (tokenizeModeStringPntr != NULL 342 && strcmp (tokenizeModeStringPntr, "JustHeader") == 0); 343 return B_OK; 344} 345 346 347status_t 348AGMSBayesianSpamFilter::_GetSpamRatio(const char* stringBuffer, off_t dataSize, 349 float& ratio) 350{ 351 // Send off a scripting command to the database server, asking it to 352 // evaluate the string for spaminess. Note that it can return ENOMSG 353 // when there are no words (a good indicator of spam which is pure HTML 354 // if you are using plain text only tokenization), so we could use that 355 // as a spam marker too. Code copied for the reevaluate stuff below. 356 357 BMessage scriptingMessage(B_SET_PROPERTY); 358 scriptingMessage.AddSpecifier("EvaluateString"); 359 status_t errorCode = scriptingMessage.AddData("data", B_STRING_TYPE, 360 stringBuffer, dataSize + 1, false /* fixed size */); 361 if (errorCode != B_OK) 362 return errorCode; 363 BMessage replyMessage; 364 errorCode = fMessengerToServer.SendMessage(&scriptingMessage, 365 &replyMessage); 366 if (errorCode != B_OK 367 || replyMessage.FindInt32("error", &errorCode) != B_OK) 368 return errorCode; // Unable to read the return code. 369 if (errorCode == ENOMSG && fNoWordsMeansSpam) 370 ratio = fSpamCutoffRatio; // Yes, no words and that means spam. 
371 else if (errorCode != B_OK 372 || replyMessage.FindFloat("result", &ratio) != B_OK) 373 return errorCode; // Classification failed in one of many ways. 374 375 return errorCode; 376} 377 378 379status_t 380AGMSBayesianSpamFilter::_TrainServer(const char* stringBuffer, off_t dataSize, 381 float spamRatio) 382{ 383 BMessage scriptingMessage(B_SET_PROPERTY); 384 scriptingMessage.AddSpecifier((spamRatio >= fSpamCutoffRatio) 385 ? "SpamString" : "GenuineString"); 386 status_t errorCode = scriptingMessage.AddData ("data", B_STRING_TYPE, 387 stringBuffer, dataSize + 1, false /* fixed size */); 388 if (errorCode != B_OK) 389 return errorCode; 390 BMessage replyMessage; 391 errorCode = fMessengerToServer.SendMessage (&scriptingMessage, 392 &replyMessage); 393 if (errorCode != B_OK) 394 return errorCode; 395 errorCode = replyMessage.FindInt32("error", &errorCode); 396 397 return errorCode; 398} 399 400 401status_t 402AGMSBayesianSpamFilter::_AddSpamToSubject(BNode* file, float spamRatio) 403{ 404 attr_info info; 405 if (file->GetAttrInfo("Subject", &info) != B_OK) 406 return B_ERROR; 407 if (info.type != B_STRING_TYPE) 408 return B_ERROR; 409 410 char* buffer = new char[info.size]; 411 if (file->ReadAttr("Subject", B_STRING_TYPE, 0, buffer, info.size) < 0) { 412 delete[] buffer; 413 return B_ERROR; 414 } 415 416 BString newSubjectString; 417 newSubjectString.SetTo("[Spam "); 418 char percentageString[30]; 419 sprintf(percentageString, "%05.2f", spamRatio * 100.0); 420 newSubjectString << percentageString << "%] "; 421 newSubjectString << buffer; 422 delete[] buffer; 423 424 if (file->WriteAttr("Subject", B_STRING_TYPE, 0, newSubjectString.String(), 425 newSubjectString.Length()) < 0) 426 return B_ERROR; 427 428 return B_OK; 429} 430 431 432BString 433descriptive_name() 434{ 435 return B_TRANSLATE("Spam Filter (AGMS Bayesian)"); 436} 437 438 439MailFilter* 440instantiate_mailfilter(MailProtocol& protocol, AddonSettings* settings) 441{ 442 return new 
AGMSBayesianSpamFilter(protocol, settings); 443} 444