1/*
2 * Copyright 2002-2011, Haiku, Inc. All rights reserved.
3 * Copyright 2002 Alexander G. M. Smith.
4 * Copyright 2011, Clemens Zeidler <haiku@clemens-zeidler.de>
5 * Distributed under the terms of the MIT License.
6 */
7/******************************************************************************
8 * $Id: SpamFilter.cpp 29284 2009-02-22 13:45:40Z bga $
9 *
10 * SpamFilter - Uses Bayesian statistics to evaluate the spaminess of a
11 * message.  The evaluation is done by a separate server, this add-on just gets
12 * the text and uses scripting commands to get an evaluation from the server.
13 * If the server isn't running, it will be found and started up.  Once the
14 * evaluation has been received, it is added to the message as an attribute and
15 * optionally as an addition to the subject.  Some other add-on later in the
16 * pipeline will use the attribute to delete the message or move it to some
17 * other folder.
18 *
19 * Public Domain 2002, by Alexander G. M. Smith, no warranty.
20 *
21 * $Log: SpamFilter.cpp,v $ (SVN doesn't support log messages so manually done)
22 * r11769 | bonefish | 2005-03-17 03:30:54 -0500 (Thu, 17 Mar 2005) | 1 line
23 * Move trunk into respective module.
24 *
25 * r9934 | nwhitehorn | 2004-11-11 21:55:05 -0500 (Thu, 11 Nov 2004) | 2 lines
26 * Added AGMS's excellent spam detection software.  Still some weirdness with
27 * the configuration interface from E-mail prefs.
28 *
29 * r9669 | brunoga | 2004-10-30 18:23:26 -0400 (Sat, 30 Oct 2004) | 2 lines
30 * AGMS Spam Filter.
31 *
32 * Revision 1.19  2004/09/20 15:57:30  nwhitehorn
33 * Mostly updated the tree to Be/Haiku style identifier naming conventions. I have a few more things to work out, mostly in mail_util.h, and then I'm proceeding to jamify the build system. Then we go into Haiku CVS.
34 *
35 * Revision 1.18  2003/09/20 12:39:27  agmsmith
36 * Memory leak delete needs [] bug.
37 *
38 * Revision 1.17  2003/07/08 21:12:47  agmsmith
39 * Changed other spam filter defaults to values I find useful.
40 *
41 * Revision 1.16  2003/07/08 20:56:40  agmsmith
42 * Turn on auto-training for the spam filter by default.
43 *
44 * Revision 1.15  2003/07/06 13:30:33  agmsmith
45 * Make sure that the spam filter doesn't auto-train the message twice
46 * when it gets a partially downloaded e-mail (will just train on the
47 * partial one, ignore the complete message when it gets downloaded).
48 *
49 * Revision 1.14  2003/05/27 17:12:59  nwhitehorn
50 * Massive refactoring of the Protocol/ChainRunner/Filter system. You can probably
51 * examine its scope by examining the number of files changed. Regardless, this is
52 * preparation for lots of new features, and REAL WORKING IMAP. Yes, you heard me.
 * Enjoy, and prepare for bugs (although I've fixed all the ones I've found, I
 * suspect there are some memory leaks in ChainRunner).
55 *
56 * Revision 1.13  2003/02/08 21:54:17  agmsmith
57 * Updated the AGMSBayesianSpamServer documentation to match the current
58 * version.  Also removed the Beep options from the spam filter, now they
59 * are turned on or off in the system sound preferences.
60 *
61 * Revision 1.12  2002/12/18 02:27:45  agmsmith
62 * Added uncertain classification as suggested by BiPolar.
63 *
64 * Revision 1.11  2002/12/16 16:03:20  agmsmith
65 * Changed spam cutoff to 0.95 to work with default Chi-Squared scoring.
66 *
67 * Revision 1.10  2002/12/13 22:04:42  agmsmith
68 * Changed default to turn on the Spam marker in the subject.
69 *
70 * Revision 1.9  2002/12/13 20:27:44  agmsmith
71 * Added auto-training mode to the filter.  It evaluates a message for
72 * spaminess then recursively adds it to the database.  This can lead
73 * to weird results unless the user corrects the bad classifications.
74 *
75 * Revision 1.8  2002/11/28 20:20:57  agmsmith
76 * Now checks if the spam database is running in headers only mode, and
77 * then only downloads headers if that is the case.
78 *
79 * Revision 1.7  2002/11/10 19:36:26  agmsmith
80 * Retry launching server a few times, but not too many.
81 *
82 * Revision 1.6  2002/11/03 02:21:02  agmsmith
83 * Never mind, just use the SourceForge version numbers.  Ugh.
84 *
85 * Revision 1.8  2002/10/21 16:12:09  agmsmith
86 * Added option for spam if no words found, use new method of saving
87 * the attribute which avoids hacking the rest of the mail system.
88 *
89 * Revision 1.7  2002/10/11 20:01:28  agmsmith
90 * Added sound effects (system beep) for genuine and spam, plus config option
91 * for it.
92 *
93 * Revision 1.6  2002/10/01 00:45:34  agmsmith
94 * Changed default spam ratio to 0.56 from 0.9, for use with
95 * the Gary Robinson method in AGMSBayesianSpamServer 1.49.
96 *
97 * Revision 1.5  2002/09/25 13:23:21  agmsmith
98 * Don't leave the data stream at the initial position, try leaving it
99 * at the end.  Was having mail progress bar problems.
100 *
101 * Revision 1.4  2002/09/23 19:14:13  agmsmith
102 * Added an option to have the server quit when done.
103 *
104 * Revision 1.3  2002/09/23 03:33:34  agmsmith
105 * First working version, with cutoff ratio and subject modification,
106 * and an attribute added if a patch is made to the Folder filter.
107 *
108 * Revision 1.2  2002/09/21 20:57:22  agmsmith
109 * Fixed bugs so now it compiles.
110 *
111 * Revision 1.1  2002/09/21 20:47:15  agmsmith
112 * Initial revision
113 */
114
115#include <Beep.h>
116#include <Catalog.h>
117#include <fs_attr.h>
118#include <Messenger.h>
119#include <Node.h>
120#include <Path.h>
121#include <Roster.h>
122#include <String.h>
123#include <FindDirectory.h>
124#include <Entry.h>
125
126#include <stdlib.h>
127#include <stdio.h>
128
129#include "SpamFilter.h"
130
131
132#undef B_TRANSLATION_CONTEXT
133#define B_TRANSLATION_CONTEXT "SpamFilter"
134
135
// Sound event names passed to system_beep() once a message has been
// classified as genuine, spam or uncertain.
// The names match the ones set up by spamdbm for sound effects.
static const char *kAGMSBayesBeepGenuineName = "SpamFilter-Genuine";
static const char *kAGMSBayesBeepSpamName = "SpamFilter-Spam";
static const char *kAGMSBayesBeepUncertainName = "SpamFilter-Uncertain";

// Application signature of the spam database server.  It is used to check
// whether the server is running, to launch it, and to build the BMessenger
// that the scripting messages are sent through.
static const char *kServerSignature = "application/x-vnd.agmsmith.spamdbm";
142
143
144AGMSBayesianSpamFilter::AGMSBayesianSpamFilter(MailProtocol& protocol,
145	AddonSettings* addonSettings)
146	:
147	MailFilter(protocol, addonSettings),
148
149	fAddSpamToSubject(false),
150	fAutoTraining(true),
151	fGenuineCutoffRatio(0.01f),
152	fHeaderOnly(false),
153	fLaunchAttemptCount(0),
154	fNoWordsMeansSpam(true),
155	fQuitServerWhenFinished(false),
156	fSpamCutoffRatio(0.99f)
157{
158	bool		tempBool;
159	float		tempFloat;
160	BMessenger	tempMessenger;
161
162	const BMessage* settings = &addonSettings->Settings();
163	if (settings != NULL) {
164		if (settings->FindBool ("AddMarkerToSubject", &tempBool) == B_OK)
165			fAddSpamToSubject = tempBool;
166		if (settings->FindBool ("AutoTraining", &tempBool) == B_OK)
167			fAutoTraining = tempBool;
168		if (settings->FindFloat ("GenuineCutoffRatio", &tempFloat) == B_OK)
169			fGenuineCutoffRatio = tempFloat;
170		if (settings->FindBool ("NoWordsMeansSpam", &tempBool) == B_OK)
171			fNoWordsMeansSpam = tempBool;
172		if (settings->FindBool ("QuitServerWhenFinished", &tempBool) == B_OK)
173			fQuitServerWhenFinished = tempBool;
174		if (settings->FindFloat ("SpamCutoffRatio", &tempFloat) == B_OK)
175			fSpamCutoffRatio = tempFloat;
176	}
177}
178
179
180AGMSBayesianSpamFilter::~AGMSBayesianSpamFilter ()
181{
182	if (fQuitServerWhenFinished && fMessengerToServer.IsValid ())
183		fMessengerToServer.SendMessage(B_QUIT_REQUESTED);
184}
185
186
// Filter hook for a fetched message header: runs the spam check on the given
// file.  The entry_ref is not used and the status returned by
// _CheckForSpam() is discarded.
void
AGMSBayesianSpamFilter::HeaderFetched(const entry_ref& ref, BFile* file)
{
	_CheckForSpam(file);
}
192
193
194void
195AGMSBayesianSpamFilter::BodyFetched(const entry_ref& ref, BFile* file)
196{
197	if (fHeaderOnly)
198		return;
199
200	// See if the message has already been classified.  Happens for messages
201	// which are partially downloaded when you have auto-training on.  Could
202	// untrain the partial part before training on the complete message, but we
203	// don't know how big it was, so instead just ignore the message.
204	attr_info attributeInfo;
205	if (file->GetAttrInfo ("MAIL:classification", &attributeInfo) == B_OK)
206		return;
207
208	_CheckForSpam(file);
209}
210
211
212status_t
213AGMSBayesianSpamFilter::_CheckForSpam(BFile* file)
214{
215	// Get a connection to the spam database server.  Launch if needed, should
216	// only need it once, unless another e-mail thread shuts down the server
217	// inbetween messages.  This code used to be in InitCheck, but apparently
218	// that isn't called.
219	printf("Checking for Spam Server.\n");
220	if (fLaunchAttemptCount == 0 || !fMessengerToServer.IsValid ()) {
221		if (_GetTokenizeMode() != B_OK)
222			return B_ERROR;
223	}
224
225	off_t dataSize;
226	file->GetSize(&dataSize);
227	char* stringBuffer = new char[dataSize + 1];
228	file->Read(stringBuffer, dataSize);
229	stringBuffer[dataSize] = 0; // Add an end of string NUL, just in case.
230
231	float spamRatio;
232	if (_GetSpamRatio(stringBuffer, dataSize, spamRatio) != B_OK)
233		return B_ERROR;
234
235	// If we are auto-training, feed back the message to the server as a
236	// training example (don't train if it is uncertain).
237	if (fAutoTraining && (spamRatio >= fSpamCutoffRatio
238		|| spamRatio < fGenuineCutoffRatio)) {
239			_TrainServer(stringBuffer, dataSize, spamRatio);
240	}
241
242	delete[] stringBuffer;
243
244	// write attributes
245	const char *classificationString;
246	classificationString = (spamRatio >= fSpamCutoffRatio) ? "Spam"
247		: ((spamRatio < fGenuineCutoffRatio) ? "Genuine" : "Uncertain");
248	file->WriteAttr("MAIL:classification", B_STRING_TYPE, 0 /* offset */,
249		classificationString, strlen(classificationString) + 1);
250
251	// Store the spam ratio in an attribute called MAIL:ratio_spam,
252	// attached to the eventual output file.
253	file->WriteAttr("MAIL:ratio_spam", B_FLOAT_TYPE, 0 /* offset */, &spamRatio,
254		sizeof(spamRatio));
255
256	// Also add it to the subject, if requested.
257	if (fAddSpamToSubject && spamRatio >= fSpamCutoffRatio)
258		_AddSpamToSubject(file, spamRatio);
259
260	// Beep using different sounds for spam and genuine, as Jeremy Friesner
261	// nudged me to get around to implementing.  And add uncertain to that, as
262	// "BiPolar" suggested.  If the user doesn't want to hear the sound, they
263	// can turn it off in the system sound preferences.
264
265	if (spamRatio >= fSpamCutoffRatio) {
266		system_beep(kAGMSBayesBeepSpamName);
267	} else if (spamRatio < fGenuineCutoffRatio) {
268		system_beep(kAGMSBayesBeepGenuineName);
269	} else {
270		system_beep(kAGMSBayesBeepUncertainName);
271	}
272
273	return B_OK;
274}
275
276
277status_t
278AGMSBayesianSpamFilter::_CheckForSpamServer()
279{
280	// Make sure the server is running.
281	if (be_roster->IsRunning (kServerSignature))
282		return B_OK;
283
284	status_t errorCode = be_roster->Launch (kServerSignature);
285	if (errorCode == B_OK)
286		return errorCode;
287
288	BPath path;
289	entry_ref ref;
290	directory_which places[] = {B_COMMON_BIN_DIRECTORY,B_BEOS_BIN_DIRECTORY};
291	for (int32 i = 0; i < 2; i++) {
292		find_directory(places[i],&path);
293		path.Append("spamdbm");
294		if (!BEntry(path.Path()).Exists())
295			continue;
296		get_ref_for_path(path.Path(),&ref);
297		if ((errorCode =  be_roster->Launch(&ref)) == B_OK)
298			break;
299	}
300
301	return errorCode;
302}
303
304
305status_t
306AGMSBayesianSpamFilter::_GetTokenizeMode()
307{
308	if (fLaunchAttemptCount > 3)
309		return B_ERROR; // Don't try to start the server too many times.
310	fLaunchAttemptCount++;
311
312	// Make sure the server is running.
313	status_t errorCode = _CheckForSpamServer();
314	if (errorCode != B_OK)
315		return errorCode;
316
317	// Set up the messenger to the database server.
318	fMessengerToServer = BMessenger(kServerSignature);
319	if (!fMessengerToServer.IsValid ())
320		return B_ERROR;
321
322	// Check if the server is running in headers only mode.  If so, we only
323	// need to download the header rather than the entire message.
324	BMessage scriptingMessage(B_GET_PROPERTY);
325	scriptingMessage.AddSpecifier("TokenizeMode");
326	BMessage replyMessage;
327	if ((errorCode = fMessengerToServer.SendMessage (&scriptingMessage,
328		&replyMessage)) != B_OK)
329		return errorCode;
330	status_t tempErrorCode;
331	if ((errorCode = replyMessage.FindInt32 ("error", &tempErrorCode))
332		!= B_OK)
333		return errorCode;
334	if ((errorCode = tempErrorCode) != B_OK)
335		return errorCode;
336
337	const char  *tokenizeModeStringPntr;
338	if ((errorCode = replyMessage.FindString ("result",
339		&tokenizeModeStringPntr)) != B_OK)
340		return errorCode;
341	fHeaderOnly = (tokenizeModeStringPntr != NULL
342		&& strcmp (tokenizeModeStringPntr, "JustHeader") == 0);
343	return B_OK;
344}
345
346
347status_t
348AGMSBayesianSpamFilter::_GetSpamRatio(const char* stringBuffer, off_t dataSize,
349	float& ratio)
350{
351	// Send off a scripting command to the database server, asking it to
352	// evaluate the string for spaminess.  Note that it can return ENOMSG
353	// when there are no words (a good indicator of spam which is pure HTML
354	// if you are using plain text only tokenization), so we could use that
355	// as a spam marker too.  Code copied for the reevaluate stuff below.
356
357	BMessage scriptingMessage(B_SET_PROPERTY);
358	scriptingMessage.AddSpecifier("EvaluateString");
359	status_t errorCode = scriptingMessage.AddData("data", B_STRING_TYPE,
360		stringBuffer, dataSize + 1, false /* fixed size */);
361	if (errorCode != B_OK)
362		return errorCode;
363	BMessage replyMessage;
364	errorCode = fMessengerToServer.SendMessage(&scriptingMessage,
365		&replyMessage);
366	if (errorCode != B_OK
367		|| replyMessage.FindInt32("error", &errorCode) != B_OK)
368		return errorCode; // Unable to read the return code.
369	if (errorCode == ENOMSG && fNoWordsMeansSpam)
370		ratio = fSpamCutoffRatio; // Yes, no words and that means spam.
371	else if (errorCode != B_OK
372		|| replyMessage.FindFloat("result", &ratio) != B_OK)
373		return errorCode; // Classification failed in one of many ways.
374
375	return errorCode;
376}
377
378
379status_t
380AGMSBayesianSpamFilter::_TrainServer(const char* stringBuffer, off_t dataSize,
381	float spamRatio)
382{
383	BMessage scriptingMessage(B_SET_PROPERTY);
384	scriptingMessage.AddSpecifier((spamRatio >= fSpamCutoffRatio)
385		? "SpamString" : "GenuineString");
386	status_t errorCode = scriptingMessage.AddData ("data", B_STRING_TYPE,
387		stringBuffer, dataSize + 1, false /* fixed size */);
388	if (errorCode != B_OK)
389		return errorCode;
390	BMessage replyMessage;
391	errorCode = fMessengerToServer.SendMessage (&scriptingMessage,
392		&replyMessage);
393	if (errorCode != B_OK)
394		return errorCode;
395	errorCode = replyMessage.FindInt32("error", &errorCode);
396
397	return errorCode;
398}
399
400
401status_t
402AGMSBayesianSpamFilter::_AddSpamToSubject(BNode* file, float spamRatio)
403{
404	attr_info info;
405	if (file->GetAttrInfo("Subject", &info) != B_OK)
406		return B_ERROR;
407	if (info.type != B_STRING_TYPE)
408		return B_ERROR;
409
410	char* buffer = new char[info.size];
411	if (file->ReadAttr("Subject", B_STRING_TYPE, 0, buffer, info.size) < 0) {
412		delete[] buffer;
413		return B_ERROR;
414	}
415
416	BString newSubjectString;
417	newSubjectString.SetTo("[Spam ");
418	char percentageString[30];
419	sprintf(percentageString, "%05.2f", spamRatio * 100.0);
420	newSubjectString << percentageString << "%] ";
421	newSubjectString << buffer;
422	delete[] buffer;
423
424	if (file->WriteAttr("Subject", B_STRING_TYPE, 0, newSubjectString.String(),
425		newSubjectString.Length()) < 0)
426		return B_ERROR;
427
428	return B_OK;
429}
430
431
432BString
433descriptive_name()
434{
435	return B_TRANSLATE("Spam Filter (AGMS Bayesian)");
436}
437
438
439MailFilter*
440instantiate_mailfilter(MailProtocol& protocol, AddonSettings* settings)
441{
442	return new AGMSBayesianSpamFilter(protocol, settings);
443}
444