1/*
2 * Copyright 2002-2013, Haiku, Inc. All rights reserved.
3 * Copyright 2002 Alexander G. M. Smith.
4 * Copyright 2011, Clemens Zeidler <haiku@clemens-zeidler.de>
5 * Distributed under the terms of the MIT License.
6 */
7
/*!	Uses Bayesian statistics to evaluate the spaminess of a message.
	The evaluation is done by a separate server; this add-on just gets
	the text and uses scripting commands to get an evaluation from the server.
	If the server isn't running, it will be found and started up.  Once the
	evaluation has been received, it is added to the message as an attribute and
	optionally as an addition to the subject.  Some other add-on later in the
	pipeline will use the attribute to delete the message or move it to some
	other folder.
*/
17
18
19#include "SpamFilter.h"
20
21#include <stdlib.h>
22#include <stdio.h>
23
24#include <Beep.h>
25#include <Catalog.h>
26#include <fs_attr.h>
27#include <Messenger.h>
28#include <Node.h>
29#include <Path.h>
30#include <Roster.h>
31#include <String.h>
32#include <FindDirectory.h>
33#include <Entry.h>
34
35
36#undef B_TRANSLATION_CONTEXT
37#define B_TRANSLATION_CONTEXT "SpamFilter"
38
39
// The names match the ones set up by spamdbm for sound effects.
// Exactly one of them is handed to system_beep() after a message has been
// classified as genuine, spam or uncertain.
static const char* kAGMSBayesBeepGenuineName = "SpamFilter-Genuine";
static const char* kAGMSBayesBeepSpamName = "SpamFilter-Spam";
static const char* kAGMSBayesBeepUncertainName = "SpamFilter-Uncertain";

// Application signature of the spam database server.  It is used to ask the
// roster whether the server is running, to launch it, and to build the
// messenger the scripting commands are sent through.
static const char* kServerSignature = "application/x-vnd.agmsmith.spamdbm";
46
47
48SpamFilter::SpamFilter(BMailProtocol& protocol,
49	const BMailAddOnSettings& settings)
50	:
51	BMailFilter(protocol, &settings)
52{
53	fAddSpamToSubject = settings.GetBool("AddMarkerToSubject", false);
54	fAutoTraining = settings.GetBool("AutoTraining", true);
55	fGenuineCutoffRatio = settings.GetFloat("GenuineCutoffRatio", 0.01f);
56	fNoWordsMeansSpam = settings.GetBool("NoWordsMeansSpam", true);
57	fQuitServerWhenFinished = settings.GetBool("QuitServerWhenFinished", false);
58	fSpamCutoffRatio = settings.GetFloat("SpamCutoffRatio", 0.99f);
59}
60
61
62SpamFilter::~SpamFilter()
63{
64	if (fQuitServerWhenFinished)
65		fMessengerToServer.SendMessage(B_QUIT_REQUESTED);
66}
67
68
69BMailFilterAction
70SpamFilter::HeaderFetched(entry_ref& ref, BFile& file, BMessage& attributes)
71{
72	_CheckForSpam(file);
73	return B_NO_MAIL_ACTION;
74}
75
76
77void
78SpamFilter::BodyFetched(const entry_ref& ref, BFile& file, BMessage& attributes)
79{
80	if (fHeaderOnly)
81		return;
82
83	// See if the message has already been classified.  Happens for messages
84	// which are partially downloaded when you have auto-training on.  Could
85	// untrain the partial part before training on the complete message, but we
86	// don't know how big it was, so instead just ignore the message.
87	attr_info attributeInfo;
88	if (file.GetAttrInfo("MAIL:classification", &attributeInfo) == B_OK)
89		return;
90
91	_CheckForSpam(file);
92}
93
94
95status_t
96SpamFilter::_CheckForSpam(BFile& file)
97{
98	// Get a connection to the spam database server.  Launch if needed, should
99	// only need it once, unless another e-mail thread shuts down the server
100	// inbetween messages.  This code used to be in InitCheck, but apparently
101	// that isn't called.
102	printf("Checking for Spam Server.\n");
103	if (fLaunchAttemptCount == 0 || !fMessengerToServer.IsValid()) {
104		if (_GetTokenizeMode() != B_OK)
105			return B_ERROR;
106	}
107
108	off_t dataSize;
109	file.GetSize(&dataSize);
110	char* stringBuffer = new char[dataSize + 1];
111	file.Read(stringBuffer, dataSize);
112	stringBuffer[dataSize] = 0; // Add an end of string NUL, just in case.
113
114	float spamRatio;
115	if (_GetSpamRatio(stringBuffer, dataSize, spamRatio) != B_OK)
116		return B_ERROR;
117
118	// If we are auto-training, feed back the message to the server as a
119	// training example (don't train if it is uncertain).
120	if (fAutoTraining && (spamRatio >= fSpamCutoffRatio
121		|| spamRatio < fGenuineCutoffRatio)) {
122		_TrainServer(stringBuffer, dataSize, spamRatio);
123	}
124
125	delete[] stringBuffer;
126
127	// write attributes
128	BString classificationString = spamRatio >= fSpamCutoffRatio ? "Spam"
129		: spamRatio < fGenuineCutoffRatio ? "Genuine" : "Uncertain";
130	file.WriteAttrString("MAIL:classification", &classificationString);
131
132	// Store the spam ratio in an attribute called MAIL:ratio_spam,
133	// attached to the eventual output file.
134	file.WriteAttr("MAIL:ratio_spam", B_FLOAT_TYPE, 0 /* offset */, &spamRatio,
135		sizeof(spamRatio));
136
137	// Also add it to the subject, if requested.
138	if (fAddSpamToSubject && spamRatio >= fSpamCutoffRatio)
139		_AddSpamToSubject(file, spamRatio);
140
141	// Beep using different sounds for spam and genuine, as Jeremy Friesner
142	// nudged me to get around to implementing.  And add uncertain to that, as
143	// "BiPolar" suggested.  If the user doesn't want to hear the sound, they
144	// can turn it off in the system sound preferences.
145
146	if (spamRatio >= fSpamCutoffRatio)
147		system_beep(kAGMSBayesBeepSpamName);
148	else if (spamRatio < fGenuineCutoffRatio)
149		system_beep(kAGMSBayesBeepGenuineName);
150	else
151		system_beep(kAGMSBayesBeepUncertainName);
152
153	return B_OK;
154}
155
156
157status_t
158SpamFilter::_CheckForSpamServer()
159{
160	// Make sure the server is running.
161	if (be_roster->IsRunning (kServerSignature))
162		return B_OK;
163
164	status_t status = be_roster->Launch (kServerSignature);
165	if (status == B_OK)
166		return status;
167
168	BPath path;
169	entry_ref ref;
170	const directory_which kPlaces[] = {
171		B_SYSTEM_NONPACKAGED_BIN_DIRECTORY,
172		B_SYSTEM_BIN_DIRECTORY};
173	for (size_t i = 0; i < sizeof(kPlaces) / sizeof(kPlaces[0]); i++) {
174		find_directory(kPlaces[i], &path);
175		path.Append("spamdbm");
176		if (!BEntry(path.Path()).Exists())
177			continue;
178		get_ref_for_path(path.Path(), &ref);
179		if ((status = be_roster->Launch(&ref)) == B_OK)
180			break;
181	}
182
183	return status;
184}
185
186
187status_t
188SpamFilter::_GetTokenizeMode()
189{
190	if (fLaunchAttemptCount > 3)
191		return B_ERROR; // Don't try to start the server too many times.
192	fLaunchAttemptCount++;
193
194	// Make sure the server is running.
195	status_t status = _CheckForSpamServer();
196	if (status != B_OK)
197		return status;
198
199	// Set up the messenger to the database server.
200	fMessengerToServer = BMessenger(kServerSignature);
201	if (!fMessengerToServer.IsValid())
202		return B_ERROR;
203
204	// Check if the server is running in headers only mode.  If so, we only
205	// need to download the header rather than the entire message.
206	BMessage scriptingMessage(B_GET_PROPERTY);
207	scriptingMessage.AddSpecifier("TokenizeMode");
208	BMessage replyMessage;
209	if ((status = fMessengerToServer.SendMessage(&scriptingMessage,
210			&replyMessage)) != B_OK)
211		return status;
212	status_t errorCode;
213	if ((status = replyMessage.FindInt32("error", &errorCode)) != B_OK)
214		return status;
215	if (errorCode != B_OK)
216		return errorCode;
217
218	const char* tokenizeMode;
219	if ((status = replyMessage.FindString("result", &tokenizeMode)) != B_OK)
220		return status;
221
222	fHeaderOnly = tokenizeMode != NULL && !strcmp(tokenizeMode, "JustHeader");
223	return B_OK;
224}
225
226
227status_t
228SpamFilter::_GetSpamRatio(const char* stringBuffer, off_t dataSize,
229	float& ratio)
230{
231	// Send off a scripting command to the database server, asking it to
232	// evaluate the string for spaminess.  Note that it can return ENOMSG
233	// when there are no words (a good indicator of spam which is pure HTML
234	// if you are using plain text only tokenization), so we could use that
235	// as a spam marker too.  Code copied for the reevaluate stuff below.
236
237	BMessage scriptingMessage(B_SET_PROPERTY);
238	scriptingMessage.AddSpecifier("EvaluateString");
239	status_t errorCode = scriptingMessage.AddData("data", B_STRING_TYPE,
240		stringBuffer, dataSize + 1, false /* fixed size */);
241	if (errorCode != B_OK)
242		return errorCode;
243	BMessage replyMessage;
244	errorCode = fMessengerToServer.SendMessage(&scriptingMessage,
245		&replyMessage);
246	if (errorCode != B_OK
247		|| replyMessage.FindInt32("error", &errorCode) != B_OK)
248		return errorCode; // Unable to read the return code.
249	if (errorCode == ENOMSG && fNoWordsMeansSpam)
250		ratio = fSpamCutoffRatio; // Yes, no words and that means spam.
251	else if (errorCode != B_OK
252		|| replyMessage.FindFloat("result", &ratio) != B_OK)
253		return errorCode; // Classification failed in one of many ways.
254
255	return errorCode;
256}
257
258
259status_t
260SpamFilter::_TrainServer(const char* stringBuffer, off_t dataSize,
261	float spamRatio)
262{
263	BMessage scriptingMessage(B_SET_PROPERTY);
264	scriptingMessage.AddSpecifier((spamRatio >= fSpamCutoffRatio)
265		? "SpamString" : "GenuineString");
266	status_t errorCode = scriptingMessage.AddData ("data", B_STRING_TYPE,
267		stringBuffer, dataSize + 1, false /* fixed size */);
268	if (errorCode != B_OK)
269		return errorCode;
270	BMessage replyMessage;
271	errorCode = fMessengerToServer.SendMessage (&scriptingMessage,
272		&replyMessage);
273	if (errorCode != B_OK)
274		return errorCode;
275	errorCode = replyMessage.FindInt32("error", &errorCode);
276
277	return errorCode;
278}
279
280
281status_t
282SpamFilter::_AddSpamToSubject(BNode& file, float spamRatio)
283{
284	attr_info info;
285	if (file.GetAttrInfo("Subject", &info) != B_OK)
286		return B_ERROR;
287	if (info.type != B_STRING_TYPE)
288		return B_ERROR;
289
290	char* buffer = new char[info.size];
291	if (file.ReadAttr("Subject", B_STRING_TYPE, 0, buffer, info.size) < 0) {
292		delete[] buffer;
293		return B_ERROR;
294	}
295
296	BString newSubjectString;
297	newSubjectString.SetTo("[Spam ");
298	char percentageString[30];
299	sprintf(percentageString, "%05.2f", spamRatio * 100.0);
300	newSubjectString << percentageString << "%] ";
301	newSubjectString << buffer;
302	delete[] buffer;
303
304	if (file.WriteAttrString("Subject", &newSubjectString) < 0)
305		return B_ERROR;
306
307	return B_OK;
308}
309
310
311// #pragma mark -
312
313
314BString
315filter_name(const BMailAccountSettings& accountSettings,
316	const BMailAddOnSettings* addOnSettings)
317{
318	return B_TRANSLATE("Bayesian Spam Filter");
319}
320
321
322BMailFilter*
323instantiate_filter(BMailProtocol& protocol, const BMailAddOnSettings& settings)
324{
325	return new SpamFilter(protocol, settings);
326}
327