1/* 2 * Copyright 2002-2013, Haiku, Inc. All rights reserved. 3 * Copyright 2002 Alexander G. M. Smith. 4 * Copyright 2011, Clemens Zeidler <haiku@clemens-zeidler.de> 5 * Distributed under the terms of the MIT License. 6 */ 7 8/*! Uses Bayesian statistics to evaluate the spaminess of a message. 9 The evaluation is done by a separate server, this add-on just gets 10 the text and uses scripting commands to get an evaluation from the server. 11 If the server isn't running, it will be found and started up. Once the 12 evaluation has been received, it is added to the message as an attribute and 13 optionally as an addition to the subject. Some other add-on later in the 14 pipeline will use the attribute to delete the message or move it to some 15 other folder. 16*/ 17 18 19#include "SpamFilter.h" 20 21#include <stdlib.h> 22#include <stdio.h> 23 24#include <Beep.h> 25#include <Catalog.h> 26#include <fs_attr.h> 27#include <Messenger.h> 28#include <Node.h> 29#include <Path.h> 30#include <Roster.h> 31#include <String.h> 32#include <FindDirectory.h> 33#include <Entry.h> 34 35 36#undef B_TRANSLATION_CONTEXT 37#define B_TRANSLATION_CONTEXT "SpamFilter" 38 39 40// The names match the ones set up by spamdbm for sound effects. 41static const char* kAGMSBayesBeepGenuineName = "SpamFilter-Genuine"; 42static const char* kAGMSBayesBeepSpamName = "SpamFilter-Spam"; 43static const char* kAGMSBayesBeepUncertainName = "SpamFilter-Uncertain"; 44 45static const char* kServerSignature = "application/x-vnd.agmsmith.spamdbm"; 46 47 48SpamFilter::SpamFilter(BMailProtocol& protocol, 49 const BMailAddOnSettings& settings) 50 : 51 BMailFilter(protocol, &settings) 52{ 53 fAddSpamToSubject = settings.GetBool("AddMarkerToSubject", false); 54 fAutoTraining = settings.GetBool("AutoTraining", true); 55 fGenuineCutoffRatio = settings.GetFloat("GenuineCutoffRatio", 0.01f); 56 fNoWordsMeansSpam = settings.GetBool("NoWordsMeansSpam", true); 57 fQuitServerWhenFinished = settings.GetBool("QuitServerWhenFinished", false); 58 fSpamCutoffRatio = settings.GetFloat("SpamCutoffRatio", 0.99f); 59} 60 61 62SpamFilter::~SpamFilter() 63{ 64 if (fQuitServerWhenFinished) 65 fMessengerToServer.SendMessage(B_QUIT_REQUESTED); 66} 67 68 69BMailFilterAction 70SpamFilter::HeaderFetched(entry_ref& ref, BFile& file, BMessage& attributes) 71{ 72 _CheckForSpam(file); 73 return B_NO_MAIL_ACTION; 74} 75 76 77void 78SpamFilter::BodyFetched(const entry_ref& ref, BFile& file, BMessage& attributes) 79{ 80 if (fHeaderOnly) 81 return; 82 83 // See if the message has already been classified. Happens for messages 84 // which are partially downloaded when you have auto-training on. Could 85 // untrain the partial part before training on the complete message, but we 86 // don't know how big it was, so instead just ignore the message. 87 attr_info attributeInfo; 88 if (file.GetAttrInfo("MAIL:classification", &attributeInfo) == B_OK) 89 return; 90 91 _CheckForSpam(file); 92} 93 94 95status_t 96SpamFilter::_CheckForSpam(BFile& file) 97{ 98 // Get a connection to the spam database server. Launch if needed, should 99 // only need it once, unless another e-mail thread shuts down the server 100 // inbetween messages. This code used to be in InitCheck, but apparently 101 // that isn't called. 102 printf("Checking for Spam Server.\n"); 103 if (fLaunchAttemptCount == 0 || !fMessengerToServer.IsValid()) { 104 if (_GetTokenizeMode() != B_OK) 105 return B_ERROR; 106 } 107 108 off_t dataSize; 109 file.GetSize(&dataSize); 110 char* stringBuffer = new char[dataSize + 1]; 111 file.Read(stringBuffer, dataSize); 112 stringBuffer[dataSize] = 0; // Add an end of string NUL, just in case. 113 114 float spamRatio; 115 if (_GetSpamRatio(stringBuffer, dataSize, spamRatio) != B_OK) 116 return B_ERROR; 117 118 // If we are auto-training, feed back the message to the server as a 119 // training example (don't train if it is uncertain). 120 if (fAutoTraining && (spamRatio >= fSpamCutoffRatio 121 || spamRatio < fGenuineCutoffRatio)) { 122 _TrainServer(stringBuffer, dataSize, spamRatio); 123 } 124 125 delete[] stringBuffer; 126 127 // write attributes 128 BString classificationString = spamRatio >= fSpamCutoffRatio ? "Spam" 129 : spamRatio < fGenuineCutoffRatio ? "Genuine" : "Uncertain"; 130 file.WriteAttrString("MAIL:classification", &classificationString); 131 132 // Store the spam ratio in an attribute called MAIL:ratio_spam, 133 // attached to the eventual output file. 134 file.WriteAttr("MAIL:ratio_spam", B_FLOAT_TYPE, 0 /* offset */, &spamRatio, 135 sizeof(spamRatio)); 136 137 // Also add it to the subject, if requested. 138 if (fAddSpamToSubject && spamRatio >= fSpamCutoffRatio) 139 _AddSpamToSubject(file, spamRatio); 140 141 // Beep using different sounds for spam and genuine, as Jeremy Friesner 142 // nudged me to get around to implementing. And add uncertain to that, as 143 // "BiPolar" suggested. If the user doesn't want to hear the sound, they 144 // can turn it off in the system sound preferences. 145 146 if (spamRatio >= fSpamCutoffRatio) 147 system_beep(kAGMSBayesBeepSpamName); 148 else if (spamRatio < fGenuineCutoffRatio) 149 system_beep(kAGMSBayesBeepGenuineName); 150 else 151 system_beep(kAGMSBayesBeepUncertainName); 152 153 return B_OK; 154} 155 156 157status_t 158SpamFilter::_CheckForSpamServer() 159{ 160 // Make sure the server is running. 161 if (be_roster->IsRunning (kServerSignature)) 162 return B_OK; 163 164 status_t status = be_roster->Launch (kServerSignature); 165 if (status == B_OK) 166 return status; 167 168 BPath path; 169 entry_ref ref; 170 const directory_which kPlaces[] = { 171 B_SYSTEM_NONPACKAGED_BIN_DIRECTORY, 172 B_SYSTEM_BIN_DIRECTORY}; 173 for (size_t i = 0; i < sizeof(kPlaces) / sizeof(kPlaces[0]); i++) { 174 find_directory(kPlaces[i], &path); 175 path.Append("spamdbm"); 176 if (!BEntry(path.Path()).Exists()) 177 continue; 178 get_ref_for_path(path.Path(), &ref); 179 if ((status = be_roster->Launch(&ref)) == B_OK) 180 break; 181 } 182 183 return status; 184} 185 186 187status_t 188SpamFilter::_GetTokenizeMode() 189{ 190 if (fLaunchAttemptCount > 3) 191 return B_ERROR; // Don't try to start the server too many times. 192 fLaunchAttemptCount++; 193 194 // Make sure the server is running. 195 status_t status = _CheckForSpamServer(); 196 if (status != B_OK) 197 return status; 198 199 // Set up the messenger to the database server. 200 fMessengerToServer = BMessenger(kServerSignature); 201 if (!fMessengerToServer.IsValid()) 202 return B_ERROR; 203 204 // Check if the server is running in headers only mode. If so, we only 205 // need to download the header rather than the entire message. 206 BMessage scriptingMessage(B_GET_PROPERTY); 207 scriptingMessage.AddSpecifier("TokenizeMode"); 208 BMessage replyMessage; 209 if ((status = fMessengerToServer.SendMessage(&scriptingMessage, 210 &replyMessage)) != B_OK) 211 return status; 212 status_t errorCode; 213 if ((status = replyMessage.FindInt32("error", &errorCode)) != B_OK) 214 return status; 215 if (errorCode != B_OK) 216 return errorCode; 217 218 const char* tokenizeMode; 219 if ((status = replyMessage.FindString("result", &tokenizeMode)) != B_OK) 220 return status; 221 222 fHeaderOnly = tokenizeMode != NULL && !strcmp(tokenizeMode, "JustHeader"); 223 return B_OK; 224} 225 226 227status_t 228SpamFilter::_GetSpamRatio(const char* stringBuffer, off_t dataSize, 229 float& ratio) 230{ 231 // Send off a scripting command to the database server, asking it to 232 // evaluate the string for spaminess. Note that it can return ENOMSG 233 // when there are no words (a good indicator of spam which is pure HTML 234 // if you are using plain text only tokenization), so we could use that 235 // as a spam marker too. Code copied for the reevaluate stuff below. 236 237 BMessage scriptingMessage(B_SET_PROPERTY); 238 scriptingMessage.AddSpecifier("EvaluateString"); 239 status_t errorCode = scriptingMessage.AddData("data", B_STRING_TYPE, 240 stringBuffer, dataSize + 1, false /* fixed size */); 241 if (errorCode != B_OK) 242 return errorCode; 243 BMessage replyMessage; 244 errorCode = fMessengerToServer.SendMessage(&scriptingMessage, 245 &replyMessage); 246 if (errorCode != B_OK 247 || replyMessage.FindInt32("error", &errorCode) != B_OK) 248 return errorCode; // Unable to read the return code. 249 if (errorCode == ENOMSG && fNoWordsMeansSpam) 250 ratio = fSpamCutoffRatio; // Yes, no words and that means spam. 251 else if (errorCode != B_OK 252 || replyMessage.FindFloat("result", &ratio) != B_OK) 253 return errorCode; // Classification failed in one of many ways. 254 255 return errorCode; 256} 257 258 259status_t 260SpamFilter::_TrainServer(const char* stringBuffer, off_t dataSize, 261 float spamRatio) 262{ 263 BMessage scriptingMessage(B_SET_PROPERTY); 264 scriptingMessage.AddSpecifier((spamRatio >= fSpamCutoffRatio) 265 ? "SpamString" : "GenuineString"); 266 status_t errorCode = scriptingMessage.AddData ("data", B_STRING_TYPE, 267 stringBuffer, dataSize + 1, false /* fixed size */); 268 if (errorCode != B_OK) 269 return errorCode; 270 BMessage replyMessage; 271 errorCode = fMessengerToServer.SendMessage (&scriptingMessage, 272 &replyMessage); 273 if (errorCode != B_OK) 274 return errorCode; 275 errorCode = replyMessage.FindInt32("error", &errorCode); 276 277 return errorCode; 278} 279 280 281status_t 282SpamFilter::_AddSpamToSubject(BNode& file, float spamRatio) 283{ 284 attr_info info; 285 if (file.GetAttrInfo("Subject", &info) != B_OK) 286 return B_ERROR; 287 if (info.type != B_STRING_TYPE) 288 return B_ERROR; 289 290 char* buffer = new char[info.size]; 291 if (file.ReadAttr("Subject", B_STRING_TYPE, 0, buffer, info.size) < 0) { 292 delete[] buffer; 293 return B_ERROR; 294 } 295 296 BString newSubjectString; 297 newSubjectString.SetTo("[Spam "); 298 char percentageString[30]; 299 sprintf(percentageString, "%05.2f", spamRatio * 100.0); 300 newSubjectString << percentageString << "%] "; 301 newSubjectString << buffer; 302 delete[] buffer; 303 304 if (file.WriteAttrString("Subject", &newSubjectString) < 0) 305 return B_ERROR; 306 307 return B_OK; 308} 309 310 311// #pragma mark - 312 313 314BString 315filter_name(const BMailAccountSettings& accountSettings, 316 const BMailAddOnSettings* addOnSettings) 317{ 318 return B_TRANSLATE("Bayesian Spam Filter"); 319} 320 321 322BMailFilter* 323instantiate_filter(BMailProtocol& protocol, const BMailAddOnSettings& settings) 324{ 325 return new SpamFilter(protocol, settings); 326} 327