1/******************************************************************************
2 * $Id: spamdbm.cpp 30630 2009-05-05 01:31:01Z bga $
3 *
4 * This is a BeOS program for classifying e-mail messages as spam (unwanted
5 * junk mail) or as genuine mail using a Bayesian statistical approach.  There
6 * is also a Mail Daemon Replacement add-on to filter mail using the
7 * classification statistics collected earlier.
8 *
9 * See also http://www.paulgraham.com/spam.html for a good writeup and
10 * http://www.tuxedo.org/~esr/bogofilter/ for another implementation.
11 * And more recently, Gary Robinson's write up of his improved algorithm
12 * at http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html
13 * which gives a better spread in spam ratios and slightly fewer
14 * misclassifications.
15 *
16 * Note that this uses the AGMS vacation coding style, not the OpenTracker one.
17 * That means no tabs, indents are two spaces, m_ is the prefix for member
18 * variables, g_ is the prefix for global names, C style comments, constants
19 * are in all capital letters and most other things are mixed case, it's word
20 * wrapped to fit in 79 characters per line to make proofreading on paper
21 * easier, and functions are listed in reverse dependency order so that forward
22 * declarations (function prototypes with no code) aren't needed.
23 *
24 * The Original Design:
25 * There is a spam database (just a file listing words and number of times they
26 * were used in spam and non-spam messages) that a BeMailDaemon input filter
27 * will use when scanning email.  It will mark the mail with the spam
28 * probability (an attribute, optionally a mail header field) and optionally do
29 * something if the probability exceeds a user defined level (delete message,
30 * change subject, file in a different folder).  Or should that be a different
31 * filter?  Outside the mail system, the probability can be used in queries to
32 * find spam.
33 *
34 * A second user application will be used to update the database.  Besides
35 * showing you the current list of words, you can drag and drop files to mark
36 * them as spam or non-spam (a balanced binary tree is used internally to make
37 * word storage fast).  It will add a second attribute to the files to show how
38 * they have been classified by the user (and won't update the database if you
39 * accidentally try to classify a file again).  Besides drag and drop, there
40 * will be a command line interface and a message passing interface.  BeMail
41 * (or other programs) will then communicate via messages to tell it when the
42 * user marks a message as spam or not (via having separate delete spam /
43 * delete genuine mail buttons and a menu item or two).
44 *
45 * Plus lots of details, like the rename swap method to update the database
46 * file (so programs with the old file open aren't affected).  A nice tab text
47 * format so you can open the database in a spreadsheet.  Startup and shutdown
48 * control of the updater from BeMail.  Automatic creation of the indices
49 * needed by the filter.  MIME types for the database file.  Icons for the app.
50 * System settings to enable tracker to display the new attributes when viewing
51 * e-mail (and maybe news articles if someone ever gets around to an NNTP as
52 * files reader).  Documentation.  Recursive directory traversal for the
53 * command line or directory drag and drop.  Options for the updater to warn or
54 * ignore non-email files.  Etc.
55 *
56 * The Actual Implementation:
57 * The spam database updates and the test for spam have been combined into one
58 * program which runs as a server.  That way there won't be as long a delay
59 * when the e-mail system wants to check for spam, because the database is
60 * already loaded by the server and in memory.  The MDR mail filter add-on
61 * simply sends scripting commands to the server (and starts it up if it isn't
62 * already running).  The filter takes care of marking the messages when it
63 * gets the rating back from the server, and then the rest of the mail system
64 * rule chain can delete the message or otherwise manipulate it.
65 *
66 * Revision History (now manually updated due to SVN's philosophy)
67 * $Log: spamdbm.cpp,v $
68 * ------------------------------------------------------------------------
69 * r15195 | agmsmith | 2005-11-27 21:07:55 -0500 (Sun, 27 Nov 2005) | 4 lines
70 * Just a few minutes after checking in, I mentioned it to Japanese expert Koki
71 * and he suggested also including the Japanese comma.  So before I forget to
72 * do it...
73 *
74 * ------------------------------------------------------------------------
75 * r15194 | agmsmith | 2005-11-27 20:37:13 -0500 (Sun, 27 Nov 2005) | 5 lines
76 * Truncate overly long URLs to the maximum word length.  Convert Japanese
77 * periods to spaces so that more "words" are found.  Fix UTF-8 comparison
78 * problems with tolower() incorrectly converting characters with the high bit
79 * set.
80 *
81 * r15098 | agmsmith | 2005-11-23 23:17:00 -0500 (Wed, 23 Nov 2005) | 5 lines
82 * Added better tokenization so that HTML is parsed and things like tags
83 * between letters of a word no longer hide that word.  After testing, the
84 * result seems to be a tighter spread of ratings when done in full text plus
85 * header mode.
86 *
87 * Revision 1.10  2005/11/24 02:08:39  agmsmith
88 * Fixed up prefix codes, Z for things that are inside other things.
89 *
90 * Revision 1.9  2005/11/21 03:28:03  agmsmith
91 * Added a function for extracting URLs.
92 *
93 * Revision 1.8  2005/11/09 03:36:18  agmsmith
94 * Removed noframes detection (doesn't show up in e-mails).  Now use
95 * just H for headers and Z for HTML tag junk.
96 *
97 * Revision 1.7  2005/10/24 00:00:08  agmsmith
98 * Adding HTML tag removal, which also affected the search function so it
 * could search for single part things like &nbsp;.
100 *
101 * Revision 1.6  2005/10/17 01:55:08  agmsmith
102 * Remove HTML comments and a few other similar things.
103 *
104 * Revision 1.5  2005/10/16 18:35:36  agmsmith
105 * Under construction - looking into HTML not being in UTF-8.
106 *
107 * Revision 1.4  2005/10/11 01:51:21  agmsmith
108 * Starting on the tokenising passes.  Still need to test asian truncation.
109 *
110 * Revision 1.3  2005/10/06 11:54:07  agmsmith
111 * Not much.
112 *
113 * Revision 1.2  2005/09/12 01:49:37  agmsmith
114 * Enable case folding for the whole file tokenizer.
115 *
116 * r13961 | agmsmith | 2005-08-13 22:25:28 -0400 (Sat, 13 Aug 2005) | 2 lines
117 * Source code changes so that mboxtobemail now compiles and is in the build
118 * system.
119 *
120 * r13959 | agmsmith | 2005-08-13 22:05:27 -0400 (Sat, 13 Aug 2005) | 2 lines
121 * Rename the directory before doing anything else, otherwise svn dies badly.
122 *
123 * r13952 | agmsmith | 2005-08-13 15:31:42 -0400 (Sat, 13 Aug 2005) | 3 lines
124 * Added the resources and file type associations, changed the application
125 * signature and otherwise made the spam detection system work properly again.
126 *
127 * r13951 | agmsmith | 2005-08-13 11:40:01 -0400 (Sat, 13 Aug 2005) | 2 lines
128 * Had to do the file rename as a separate operation due to SVN limitations.
129 *
130 * r13950 | agmsmith | 2005-08-13 11:38:44 -0400 (Sat, 13 Aug 2005) | 3 lines
131 * Oops, "spamdb" is already used for a Unix package.  And spamdatabase is
132 * already reserved by a domain name squatter.  Use "spamdbm" instead.
133 *
134 * r13949 | agmsmith | 2005-08-13 11:17:52 -0400 (Sat, 13 Aug 2005) | 3 lines
135 * Renamed spamfilter to be the more meaningful spamdb (spam database) and
136 * moved it into its own source directory in preparation for adding resources.
137 *
138 * r13628 | agmsmith | 2005-07-10 20:11:29 -0400 (Sun, 10 Jul 2005) | 3 lines
139 * Updated keyword expansion to use SVN keywords.  Also seeing if svn is
140 * working well enough for me to update files from BeOS R5.
141 *
142 * r11909 | axeld | 2005-03-18 19:09:19 -0500 (Fri, 18 Mar 2005) | 2 lines
143 * Moved bin/ directory out of apps/.
144 *
145 * r11769 | bonefish | 2005-03-17 03:30:54 -0500 (Thu, 17 Mar 2005) | 1 line
146 * Move trunk into respective module.
147 *
148 * r10362 | nwhitehorn | 2004-12-06 20:14:05 -0500 (Mon, 06 Dec 2004) | 2 lines
149 * Fixed the spam filter so it works correctly now.
150 *
151 * r9934 | nwhitehorn | 2004-11-11 21:55:05 -0500 (Thu, 11 Nov 2004) | 2 lines
152 * Added AGMS's excellent spam detection software.  Still some weirdness with
153 * the configuration interface from E-mail prefs.
154 *
155 * Revision 1.2  2004/12/07 01:14:05  nwhitehorn
156 * Fixed the spam filter so it works correctly now.
157 *
158 * Revision 1.87  2004/09/20 15:57:26  nwhitehorn
159 * Mostly updated the tree to Be/Haiku style identifier naming conventions.  I
160 * have a few more things to work out, mostly in mail_util.h, and then I'm
161 * proceeding to jamify the build system.  Then we go into Haiku CVS.
162 *
163 * Revision 1.86  2003/07/26 16:47:46  agmsmith
164 * Bug - wasn't allowing double classification if the user had turned on
165 * the option to ignore the previous classification.
166 *
167 * Revision 1.85  2003/07/08 14:52:57  agmsmith
168 * Fix bug with classification choices dialog box coming up with weird
169 * sizes due to RefsReceived message coming in before ReadyToRun had
170 * finished setting up the default sizes of the controls.
171 *
172 * Revision 1.84  2003/07/04 19:59:29  agmsmith
173 * Now with a GUI option to let you declassify messages (set them back
174 * to uncertain, rather than spam or genuine).  Required a BAlert
175 * replacement since BAlerts can't do four buttons.
176 *
177 * Revision 1.83  2003/07/03 20:40:36  agmsmith
178 * Added Uncertain option for declassifying messages.
179 *
180 * Revision 1.82  2003/06/16 14:57:13  agmsmith
181 * Detect spam which uses mislabeled text attachments, going by the file name
182 * extension.
183 *
184 * Revision 1.81  2003/04/08 20:27:04  agmsmith
185 * AGMSBayesianSpamServer now shuts down immediately and returns true if
186 * it is asked to quit by the registrar.
187 *
188 * Revision 1.80  2003/04/07 19:20:27  agmsmith
189 * Ooops, int64 doesn't exist, use long long instead.
190 *
191 * Revision 1.79  2003/04/07 19:05:22  agmsmith
192 * Now with Allen Brunson's atoll for PPC (you need the %lld, but that
193 * becomes %lld on other systems).
194 *
195 * Revision 1.78  2003/04/04 22:43:53  agmsmith
196 * Fixed up atoll PPC processor hack so it would actually work, was just
197 * returning zero which meant that it wouldn't load in the database file
198 * (read the size as zero).
199 *
200 * Revision 1.77  2003/01/22 03:19:48  agmsmith
201 * Don't convert words to lower case, the case is important for spam.
202 * Particularly sentences which start with exciting words, which you
203 * normally won't use at the start of a sentence (and thus capitalize).
204 *
205 * Revision 1.76  2002/12/18 02:29:22  agmsmith
206 * Add space for the Uncertain display in Tracker.
207 *
208 * Revision 1.75  2002/12/18 01:54:37  agmsmith
209 * Added uncertain sound effect.
210 *
211 * Revision 1.74  2002/12/13 23:53:12  agmsmith
212 * Minimize the window before opening it so that it doesn't flash on the
213 * screen in server mode.  Also load the database when the window is
214 * displayed so that the user can see the words.
215 *
216 * Revision 1.73  2002/12/13 20:55:57  agmsmith
217 * Documentation.
218 *
219 * Revision 1.72  2002/12/13 20:26:11  agmsmith
220 * Fixed bug with adding messages in strings to database (was limited to
221 * messages at most 1K long).  Also changed default server mode to true
222 * since that's what people use most.
223 *
224 * Revision 1.71  2002/12/11 22:37:30  agmsmith
225 * Added commands to train on spam and genuine e-mail messages passed
 * in string arguments rather than via external files.
227 *
228 * Revision 1.70  2002/12/10 22:12:41  agmsmith
229 * Adding a message to the database now uses a BPositionIO rather than a
230 * file and file name (for future string rather than file additions).  Also
231 * now re-evaluate a file after reclassifying it so that the user can see
232 * the new ratio.  Also remove the [Spam 99.9%] subject prefix when doing
233 * a re-evaluation or classification (the number would be wrong).
234 *
235 * Revision 1.69  2002/12/10 01:46:04  agmsmith
236 * Added the Chi-Squared scoring method.
237 *
238 * Revision 1.68  2002/11/29 22:08:25  agmsmith
239 * Change default purge age to 2000 so that hitting the purge button
240 * doesn't erase stuff from the new sample database.
241 *
242 * Revision 1.67  2002/11/25 20:39:39  agmsmith
243 * Don't need to massage the MIME type since the mail library now does
244 * the lower case conversion and converts TEXT to text/plain too.
245 *
246 * Revision 1.66  2002/11/20 22:57:12  nwhitehorn
247 * PPC Compatibility Fixes
248 *
249 * Revision 1.65  2002/11/10 18:43:55  agmsmith
250 * Added a time delay to some quitting operations so that scripting commands
251 * from a second client (like a second e-mail account) will make the program
252 * abort the quit operation.
253 *
254 * Revision 1.64  2002/11/05 18:05:16  agmsmith
255 * Looked at Nathan's PPC changes (thanks!), modified style a bit.
256 *
257 * Revision 1.63  2002/11/04 03:30:22  nwhitehorn
258 * Now works (or compiles at least) on PowerPC.  I'll get around to testing it
259 * later.
260 *
261 * Revision 1.62  2002/11/04 01:03:33  agmsmith
262 * Fixed warnings so it compiles under the bemaildaemon system.
263 *
264 * Revision 1.61  2002/11/03 23:00:37  agmsmith
265 * Added to the bemaildaemon project on SourceForge.  Hmmmm, seems to switch to
266 * a new version if I commit and specify a message, but doesn't accept the
267 * message and puts up the text editor.  Must be a bug where cvs eats the first
268 * option after "commit".
269 *
270 * Revision 1.60.1.1  2002/10/22 14:29:27  agmsmith
271 * Needed to recompile with the original Libmail.so from Beta/1 since
272 * the current library uses a different constructor, and thus wouldn't
273 * run when used with the old library.
274 *
275 * Revision 1.60  2002/10/21 16:41:27  agmsmith
276 * Return a special error code when no words are found in a message,
277 * so that messages without text/plain parts can be recognized as
278 * spam by the mail filter.
279 *
280 * Revision 1.59  2002/10/20 21:29:47  agmsmith
281 * Watch out for MIME types of "text", treat as text/plain.
282 *
283 * Revision 1.58  2002/10/20 18:29:07  agmsmith
284 * *** empty log message ***
285 *
286 * Revision 1.57  2002/10/20 18:25:02  agmsmith
287 * Fix case sensitivity in MIME type tests, and fix text/any test.
288 *
289 * Revision 1.56  2002/10/19 17:00:10  agmsmith
290 * Added the pop-up menu for the tokenize modes.
291 *
292 * Revision 1.55  2002/10/19 14:54:06  agmsmith
293 * Fudge MIME type of body text components so that they get
294 * treated as text.
295 *
296 * Revision 1.54  2002/10/19 00:56:37  agmsmith
297 * The parsing of e-mail messages seems to be working now, just need
298 * to add some user interface stuff for the tokenizing mode.
299 *
300 * Revision 1.53  2002/10/18 23:37:56  agmsmith
301 * More mail kit usage, can now decode headers, but more to do.
302 *
303 * Revision 1.52  2002/10/16 23:52:33  agmsmith
304 * Getting ready to add more tokenizing modes, exploring Mail Kit to break
305 * apart messages into components (and decode BASE64 and other encodings).
306 *
307 * Revision 1.51  2002/10/11 20:05:31  agmsmith
308 * Added installation of sound effect names, which the filter will use.
309 *
310 * Revision 1.50  2002/10/02 16:50:02  agmsmith
311 * Forgot to add credits to the algorithm inventors.
312 *
313 * Revision 1.49  2002/10/01 00:39:29  agmsmith
314 * Added drag and drop to evaluate files or to add them to the list.
315 *
316 * Revision 1.48  2002/09/30 19:44:17  agmsmith
317 * Switched to Gary Robinson's method, removed max spam/genuine word.
318 *
319 * Revision 1.47  2002/09/23 17:08:55  agmsmith
320 * Add an attribute with the spam ratio to files which have been evaluated.
321 *
322 * Revision 1.46  2002/09/23 02:50:32  agmsmith
323 * Fiddling with display width of e-mail attributes.
324 *
325 * Revision 1.45  2002/09/23 01:13:56  agmsmith
326 * Oops, bug in string evaluation scripting.
327 *
328 * Revision 1.44  2002/09/22 21:00:55  agmsmith
329 * Added EvaluateString so that the BeMail add-on can pass the info without
330 * having to create a temporary file.
331 *
332 * Revision 1.43  2002/09/20 19:56:02  agmsmith
333 * Added about box and button for estimating the spam ratio of a file.
334 *
335 * Revision 1.42  2002/09/20 01:22:26  agmsmith
336 * More testing, decide that an extreme ratio bias point of 0.5 is good.
337 *
338 * Revision 1.41  2002/09/19 21:17:12  agmsmith
339 * Changed a few names and proofread the program.
340 *
341 * Revision 1.40  2002/09/19 14:27:17  agmsmith
342 * Rearranged execution of commands, moving them to a separate looper
343 * rather than the BApplication, so that thousands of files could be
344 * processed without worrying about the message queue filling up.
345 *
346 * Revision 1.39  2002/09/18 18:47:16  agmsmith
347 * Stop flickering when the view is partially obscured, update cached
348 * values in all situations except when app is busy.
349 *
350 * Revision 1.38  2002/09/18 18:08:11  agmsmith
351 * Add a function for evaluating the spam ratio of a message.
352 *
353 * Revision 1.37  2002/09/16 01:30:16  agmsmith
354 * Added Get Oldest command.
355 *
356 * Revision 1.36  2002/09/16 00:47:52  agmsmith
357 * Change the display to counter-weigh the spam ratio by the number of
358 * messages.
359 *
360 * Revision 1.35  2002/09/15 20:49:35  agmsmith
361 * Scrolling improved, buttons, keys and mouse wheel added.
362 *
363 * Revision 1.34  2002/09/15 03:46:10  agmsmith
364 * Up and down buttons under construction.
365 *
366 * Revision 1.33  2002/09/15 02:09:21  agmsmith
367 * Took out scroll bar.
368 *
369 * Revision 1.32  2002/09/15 02:05:30  agmsmith
370 * Trying to add a scroll bar, but it isn't very useful.
371 *
372 * Revision 1.31  2002/09/14 23:06:28  agmsmith
373 * Now has live updates of the list of words.
374 *
375 * Revision 1.30  2002/09/14 19:53:11  agmsmith
376 * Now with a better display of the words.
377 *
378 * Revision 1.29  2002/09/13 21:33:54  agmsmith
379 * Now draws the words in the word display view, but still primitive.
380 *
381 * Revision 1.28  2002/09/13 19:28:02  agmsmith
382 * Added display of most genuine and most spamiest, fixed up cursor.
383 *
384 * Revision 1.27  2002/09/13 03:08:42  agmsmith
385 * Show current word and message counts, and a busy cursor.
386 *
387 * Revision 1.26  2002/09/13 00:00:08  agmsmith
388 * Fixed up some deadlock problems, now using asynchronous message replies.
389 *
390 * Revision 1.25  2002/09/12 17:56:58  agmsmith
391 * Keep track of words which are spamiest and genuinest.
392 *
393 * Revision 1.24  2002/09/12 01:57:10  agmsmith
394 * Added server mode.
395 *
396 * Revision 1.23  2002/09/11 23:30:45  agmsmith
397 * Added Purge button and ignore classification checkbox.
398 *
399 * Revision 1.22  2002/09/11 21:23:13  agmsmith
400 * Added bulk update choice, purge button, moved to a BView container
401 * for all the controls (so background colour could be set, and Pulse
402 * works normally for it too).
403 *
404 * Revision 1.21  2002/09/10 22:52:49  agmsmith
405 * You can now change the database name in the GUI.
406 *
407 * Revision 1.20  2002/09/09 14:20:42  agmsmith
408 * Now can have multiple backups, and implemented refs received.
409 *
410 * Revision 1.19  2002/09/07 19:14:56  agmsmith
411 * Added standard GUI measurement code.
412 *
413 * Revision 1.18  2002/09/06 21:03:03  agmsmith
414 * Rearranging code to avoid forward references when adding a window class.
415 *
416 * Revision 1.17  2002/09/06 02:54:00  agmsmith
417 * Added the ability to purge old words from the database.
418 *
419 * Revision 1.16  2002/09/05 00:46:03  agmsmith
420 * Now adds spam to the database!
421 *
422 * Revision 1.15  2002/09/04 20:32:15  agmsmith
423 * Read ahead a couple of letters to decode quoted-printable better.
424 *
425 * Revision 1.14  2002/09/04 03:10:03  agmsmith
426 * Can now tokenize (break into words) a text file.
427 *
428 * Revision 1.13  2002/09/03 21:50:54  agmsmith
429 * Count database command, set up MIME type for the database file.
430 *
431 * Revision 1.12  2002/09/03 19:55:54  agmsmith
432 * Added loading and saving the database.
433 *
434 * Revision 1.11  2002/09/02 03:35:33  agmsmith
435 * Create indices and set up attribute associations with the e-mail MIME type.
436 *
437 * Revision 1.10  2002/09/01 15:52:49  agmsmith
438 * Can now delete the database.
439 *
440 * Revision 1.9  2002/08/31 21:55:32  agmsmith
441 * Yet more scripting.
442 *
443 * Revision 1.8  2002/08/31 21:41:37  agmsmith
444 * Under construction, with example code to decode a B_REPLY.
445 *
446 * Revision 1.7  2002/08/30 19:29:06  agmsmith
447 * Combined loading and saving settings into one function.
448 *
449 * Revision 1.6  2002/08/30 02:01:10  agmsmith
450 * Working on loading and saving settings.
451 *
452 * Revision 1.5  2002/08/29 23:17:42  agmsmith
453 * More scripting.
454 *
455 * Revision 1.4  2002/08/28 00:40:52  agmsmith
456 * Scripting now seems to work, at least the messages flow properly.
457 *
458 * Revision 1.3  2002/08/25 21:51:44  agmsmith
459 * Getting the about text formatting right.
460 *
461 * Revision 1.2  2002/08/25 21:28:20  agmsmith
462 * Trying out the BeOS scripting system as a way of implementing the program.
463 *
464 * Revision 1.1  2002/08/24 02:27:51  agmsmith
465 * Initial revision
466 */
467
468/* Standard C Library. */
469
470#include <errno.h>
471#include <stdio.h>
472#include <stdlib.h>
473#include <strings.h>
474
475/* Standard C++ library. */
476
477#include <iostream>
478
479/* STL (Standard Template Library) headers. */
480
481#include <map>
482#include <queue>
483#include <set>
484#include <string>
485#include <vector>
486
487using namespace std;
488
489/* BeOS (Be Operating System) headers. */
490
491#include <Alert.h>
492#include <Application.h>
493#include <Beep.h>
494#include <Button.h>
495#include <CheckBox.h>
496#include <Cursor.h>
497#include <Directory.h>
498#include <Entry.h>
499#include <File.h>
500#include <FilePanel.h>
501#include <FindDirectory.h>
502#include <fs_index.h>
503#include <fs_info.h>
504#include <MenuBar.h>
505#include <MenuItem.h>
506#include <Message.h>
507#include <MessageQueue.h>
508#include <MessageRunner.h>
509#include <Mime.h>
510#include <NodeInfo.h>
511#include <Path.h>
512#include <Picture.h>
513#include <PictureButton.h>
514#include <Point.h>
515#include <Polygon.h>
516#include <PopUpMenu.h>
517#include <PropertyInfo.h>
518#include <RadioButton.h>
519#include <Resources.h>
520#include <Screen.h>
521#include <ScrollBar.h>
522#include <String.h>
523#include <StringView.h>
524#include <TextControl.h>
525#include <View.h>
526
527/* Included from the Mail Daemon Replacement project (MDR) include/public
528directory, available from http://sourceforge.net/projects/bemaildaemon/ */
529
530#include <MailMessage.h>
531#include <MailAttachment.h>
532
533
534/******************************************************************************
535 * Global variables, and not-so-variable things too.  Grouped by functionality.
536 */
537
/* Standard GUI measurements used when laying out the controls.  They all
start out as zero. */

static float g_MarginBetweenControls = 0.0f;
  /* Gap left between adjacent controls, the space of a letter "M". */

static float g_LineOfTextHeight = 0.0f;
  /* Height of a line of text in the current font. */

static float g_StringViewHeight = 0.0f;
  /* Height of a string view text box. */

static float g_ButtonHeight = 0.0f;
  /* How many pixels tall buttons are. */

static float g_CheckBoxHeight = 0.0f;
  /* The same measurement for check boxes. */

static float g_RadioButtonHeight = 0.0f;
  /* The same measurement for radio buttons. */

static float g_PopUpMenuHeight = 0.0f;
  /* The same measurement for pop-up menus. */

static float g_TextBoxHeight = 0.0f;
  /* The same measurement for editable text controls. */
546
/* Strings which identify the application and its database file: the
application signature, the MIME type of database files and the default name
of the database file. */

static const char *g_ABSAppSignature = "application/x-vnd.agmsmith.spamdbm";

static const char *g_ABSDatabaseFileMIMEType =
  "text/x-vnd.agmsmith.spam_probability_database";

static const char *g_DefaultDatabaseFileName = "SpamDBM Database";

static const char *g_DatabaseRecognitionString = "Spam Database File";
  /* NOTE(review): presumably the text used to recognise a file as being a
  spam database - confirm against the database loading code. */

/* Names of the attributes attached to e-mail message files, one for the
classification chosen by the user and one for the spam ratio. */

static const char *g_AttributeNameClassification = "MAIL:classification";
static const char *g_AttributeNameSpamRatio = "MAIL:ratio_spam";

/* Names of the sound effects, one for each kind of outcome. */

static const char *g_BeepGenuine = "SpamFilter-Genuine";
static const char *g_BeepSpam = "SpamFilter-Spam";
static const char *g_BeepUncertain = "SpamFilter-Uncertain";

/* Values stored in the classification attribute. */

static const char *g_ClassifiedSpam = "Spam";
static const char *g_ClassifiedGenuine = "Genuine";

/* NOTE(review): presumably the field names used for the argument and the
reply value in scripting messages - confirm against the message handlers. */

static const char *g_DataName = "data";
static const char *g_ResultName = "result";
568
569static const char *g_SettingsDirectoryName = "Mail";
570static const char *g_SettingsFileName = "SpamDBM Settings";
571static const uint32 g_SettingsWhatCode = 'SDBM';
/* Limits for the database backup files. */

static const char *g_BackupSuffix = ".backup %d";
  /* Format string for the backup file name suffix.  NOTE(review): the %d is
  presumably replaced by the backup number - confirm. */

static const int g_MaxBackups = 10;
  /* Backups are numbered from 0 to g_MaxBackups - 1. */

/* Limits and constants for finding words and working out the spam ratio. */

static const size_t g_MaxWordLength = 50;
  /* Anything longer than this isn't considered to be a word. */

static const int g_MaxInterestingWords = 150;
  /* Only the top N words are examined. */

static const double g_RobinsonS = 0.45;
  /* Default weight used when there is no data. */

static const double g_RobinsonX = 0.5;
  /* Halfway point used when there is no data. */
578
static bool g_CommandLineMode = false;
  /* TRUE when the program was launched from the command line, in which case
  it should exit once the command has been processed.  FALSE when it is
  running with a graphical user interface. */

static bool g_ServerMode = false;
  /* In server mode (TRUE) error messages don't pop up in dialog boxes, though
  they can still be seen in stderr.  The window, if there is one, is also
  minimized. */

static int g_QuitCountdown = -1;
  /* How many pulse timing events (roughly one every half second) remain
  before the program quits.  A negative value means the countdown is stopped,
  zero means quit at the next pulse event.  The delay keeps the program alive
  for a short while after a quit request, so that any scripting commands which
  arrive in the meantime can stop the countdown.  That matters when several
  e-mail accounts are all asking for spam identification and the first one to
  finish tells the server to quit.  The program also checks that there is no
  more work to do before it tries to quit. */

static volatile bool g_AppReadyToRunCompleted = false;
  /* Turns TRUE once ReadyToRun has finished.  The BApplication starts
  processing messages before then, which can cause initialisation problems
  (button heights not determined yet), so code which might run early, such as
  RefsReceived, should wait for this flag. */
604
605static class CommanderLooper *g_CommanderLooperPntr = NULL;
606static BMessenger *g_CommanderMessenger = NULL;
607  /* Some globals for use with the looper which processes external commands
608  (arguments received, file references received), needed for avoiding deadlocks
609  which would happen if the BApplication sent a scripting message to itself. */
610
611static BCursor *g_BusyCursor = NULL;
612  /* The busy cursor, will be loaded from the resource file during application
613  startup. */
614
/* Identifying numbers for the scripting properties.  Each one is an index
into g_PropertyNames and is also the value stored in the extra_data field of
the matching g_ScriptingPropertyList entries.  PN_MAX isn't a property, it is
just the count of them, so new properties have to be added before it (and
their names added at the same position in g_PropertyNames). */

typedef enum PropertyNumbersEnum
{
  PN_DATABASE_FILE = 0,
  PN_SPAM,
  PN_SPAM_STRING,
  PN_GENUINE,
  PN_GENUINE_STRING,
  PN_UNCERTAIN,
  PN_IGNORE_PREVIOUS_CLASSIFICATION,
  PN_SERVER_MODE,
  PN_FLUSH,
  PN_PURGE_AGE,
  PN_PURGE_POPULARITY,
  PN_PURGE,
  PN_OLDEST,
  PN_EVALUATE,
  PN_EVALUATE_STRING,
  PN_RESET_TO_DEFAULTS,
  PN_INSTALL_THINGS,
  PN_TOKENIZE_MODE,
  PN_SCORING_MODE,
  PN_MAX
} PropertyNumbers;

/* The scripting name of each property, indexed by PropertyNumbers and thus
listed in exactly the same order as the enumeration.  The array size comes
from the number of names listed, rather than from PN_MAX, so that the check
which follows can notice a missing name. */

static const char * g_PropertyNames [] =
{
  "DatabaseFile",
  "Spam",
  "SpamString",
  "Genuine",
  "GenuineString",
  "Uncertain",
  "IgnorePreviousClassification",
  "ServerMode",
  "Flush",
  "PurgeAge",
  "PurgePopularity",
  "Purge",
  "Oldest",
  "Evaluate",
  "EvaluateString",
  "ResetToDefaults",
  "InstallThings",
  "TokenizeMode",
  "ScoringMode"
};

/* Compile time check that there is exactly one name for each property.  If
the counts differ then the array size below is negative and the compiler
reports an error, rather than leaving a NULL name to be found at run time. */

typedef char PropertyNamesCountCheck [
  (sizeof (g_PropertyNames) == PN_MAX * sizeof (g_PropertyNames [0]))
  ? 1 : -1];
661
662/* This array lists the scripting commands we can handle, in a format that the
663scripting system can understand too. */
664
665static struct property_info g_ScriptingPropertyList [] =
666{
667  /* *name; commands[10]; specifiers[10]; *usage; extra_data; ... */
668  {g_PropertyNames[PN_DATABASE_FILE], {B_GET_PROPERTY, 0},
669    {B_DIRECT_SPECIFIER, 0}, "Get the pathname of the current database file.  "
670    "The default name is something like B_USER_SETTINGS_DIRECTORY / "
671    "Mail / SpamDBM Database", PN_DATABASE_FILE,
672    {}, {}, {}},
673  {g_PropertyNames[PN_DATABASE_FILE], {B_SET_PROPERTY, 0},
674    {B_DIRECT_SPECIFIER, 0}, "Change the pathname of the database file to "
675    "use.  It will automatically be converted to an absolute path name, "
676    "so make sure the parent directories exist before setting it.  If it "
677    "doesn't exist, you'll have to use the create command next.",
678    PN_DATABASE_FILE, {}, {}, {}},
679  {g_PropertyNames[PN_DATABASE_FILE], {B_CREATE_PROPERTY, 0},
680    {B_DIRECT_SPECIFIER, 0}, "Creates a new empty database, will replace "
681    "the existing database file too.", PN_DATABASE_FILE, {}, {}, {}},
682  {g_PropertyNames[PN_DATABASE_FILE], {B_DELETE_PROPERTY, 0},
683    {B_DIRECT_SPECIFIER, 0}, "Deletes the database file and all backup copies "
684    "of that file too.  Really only of use for uninstallers.",
685    PN_DATABASE_FILE, {}, {}, {}},
686  {g_PropertyNames[PN_DATABASE_FILE], {B_COUNT_PROPERTIES, 0},
687    {B_DIRECT_SPECIFIER, 0}, "Returns the number of words in the database.",
688    PN_DATABASE_FILE, {}, {}, {}},
689  {g_PropertyNames[PN_SPAM], {B_SET_PROPERTY, 0}, {B_DIRECT_SPECIFIER, 0},
690    "Adds the spam in the given file (specify full pathname to be safe) to "
691    "the database.  The words in the files will be added to the list of words "
692    "in the database that identify spam messages.  The files processed will "
693    "also have the attribute MAIL:classification added with a value of "
694    "\"Spam\" or \"Genuine\" as specified.  They also have their spam ratio "
695    "attribute updated, as if you had also used the Evaluate command on "
696    "them.  If they already have the MAIL:classification "
697    "attribute and it matches the new classification then they won't get "
698    "processed (and if it is different, they will get removed from the "
699    "statistics for the old class and added to the statistics for the new "
700    "one).  You can turn off that behaviour with the "
701    "IgnorePreviousClassification property.  The command line version lets "
702    "you specify more than one pathname.", PN_SPAM, {}, {}, {}},
703  {g_PropertyNames[PN_SPAM], {B_COUNT_PROPERTIES, 0}, {B_DIRECT_SPECIFIER, 0},
704    "Returns the number of spam messages in the database.", PN_SPAM,
705    {}, {}, {}},
706  {g_PropertyNames[PN_SPAM_STRING], {B_SET_PROPERTY, 0},
707    {B_DIRECT_SPECIFIER, 0}, "Adds the spam in the given string (assumed to "
708    "be the text of a whole e-mail message, not just a file name) to the "
709    "database.", PN_SPAM_STRING, {}, {}, {}},
710  {g_PropertyNames[PN_GENUINE], {B_SET_PROPERTY, 0}, {B_DIRECT_SPECIFIER, 0},
711    "Similar to adding spam except that the message file is added to the "
712    "genuine statistics.", PN_GENUINE, {}, {}, {}},
713  {g_PropertyNames[PN_GENUINE], {B_COUNT_PROPERTIES, 0},
714    {B_DIRECT_SPECIFIER, 0}, "Returns the number of genuine messages in the "
715    "database.", PN_GENUINE, {}, {}, {}},
716  {g_PropertyNames[PN_GENUINE_STRING], {B_SET_PROPERTY, 0},
717    {B_DIRECT_SPECIFIER, 0}, "Adds the genuine message in the given string "
718    "(assumed to be the text of a whole e-mail message, not just a file name) "
719    "to the database.", PN_GENUINE_STRING, {}, {}, {}},
720  {g_PropertyNames[PN_UNCERTAIN], {B_SET_PROPERTY, 0}, {B_DIRECT_SPECIFIER, 0},
721    "Similar to adding spam except that the message file is removed from the "
722    "database, undoing the previous classification.  Obviously, it needs to "
723    "have been classified previously (using the file attributes) so it can "
724    "tell if it is removing spam or genuine words.", PN_UNCERTAIN, {}, {}, {}},
725  {g_PropertyNames[PN_IGNORE_PREVIOUS_CLASSIFICATION], {B_SET_PROPERTY, 0},
726    {B_DIRECT_SPECIFIER, 0}, "If set to true then the previous classification "
727    "(which was saved as an attribute of the e-mail message file) will be "
728    "ignored, so that you can add the message to the database again.  If set "
729    "to false (the normal case), the attribute will be examined, and if the "
730    "message has already been classified as what you claim it is, nothing "
731    "will be done.  If it was misclassified, then the message will be removed "
732    "from the statistics for the old class and added to the stats for the "
733    "new classification you have requested.",
734    PN_IGNORE_PREVIOUS_CLASSIFICATION, {}, {}, {}},
735  {g_PropertyNames[PN_IGNORE_PREVIOUS_CLASSIFICATION], {B_GET_PROPERTY, 0},
736    {B_DIRECT_SPECIFIER, 0}, "Find out the current setting of the flag for "
737    "ignoring the previously recorded classification.",
738    PN_IGNORE_PREVIOUS_CLASSIFICATION, {}, {}, {}},
739  {g_PropertyNames[PN_SERVER_MODE], {B_SET_PROPERTY, 0},
740    {B_DIRECT_SPECIFIER, 0}, "If set to true then error messages get printed "
741    "to the standard error stream rather than showing up in an alert box.  "
742    "It also starts up with the window minimized.", PN_SERVER_MODE,
743    {}, {}, {}},
744  {g_PropertyNames[PN_SERVER_MODE], {B_GET_PROPERTY, 0},
745    {B_DIRECT_SPECIFIER, 0}, "Find out the setting of the server mode flag.",
746    PN_SERVER_MODE, {}, {}, {}},
747  {g_PropertyNames[PN_FLUSH], {B_EXECUTE_PROPERTY, 0},
748    {B_DIRECT_SPECIFIER, 0}, "Writes out the database file to disk, if it has "
749    "been updated in memory but hasn't been saved to disk.  It will "
750    "automatically get written when the program exits, so this command is "
751    "mostly useful for server mode.", PN_FLUSH, {}, {}, {}},
752  {g_PropertyNames[PN_PURGE_AGE], {B_SET_PROPERTY, 0},
753    {B_DIRECT_SPECIFIER, 0}, "Sets the old age limit.  Words which haven't "
754      "been updated since this many message additions to the database may be "
755      "deleted when you do a purge.  A good value is 1000, meaning that if a "
756      "word hasn't appeared in the last 1000 spam/genuine messages, it will "
757      "be forgotten.  Zero will purge all words, 1 will purge words not in "
758      "the last message added to the database, 2 will purge words not in the "
759      "last two messages added, and so on.  This is mostly useful for "
760      "removing those one time words which are often hunks of binary garbage, "
761      "not real words.  This acts in combination with the popularity limit; "
762      "both conditions have to be valid before the word gets deleted.",
763      PN_PURGE_AGE, {}, {}, {}},
764  {g_PropertyNames[PN_PURGE_AGE], {B_GET_PROPERTY, 0},
765    {B_DIRECT_SPECIFIER, 0}, "Gets the old age limit.", PN_PURGE_AGE,
766    {}, {}, {}},
767  {g_PropertyNames[PN_PURGE_POPULARITY], {B_SET_PROPERTY, 0},
768    {B_DIRECT_SPECIFIER, 0}, "Sets the popularity limit.  Words which aren't "
769    "this popular may be deleted when you do a purge.  A good value is 5, "
770    "which means that the word is safe from purging if it has been seen in 6 "
771    "or more e-mail messages.  If it's only in 5 or less, then it may get "
772    "purged.  The extreme is zero, where only words that haven't been seen "
773    "in any message are deleted (usually means no words).  This acts in "
774    "combination with the old age limit; both conditions have to be valid "
775    "before the word gets deleted.", PN_PURGE_POPULARITY, {}, {}, {}},
776  {g_PropertyNames[PN_PURGE_POPULARITY], {B_GET_PROPERTY, 0},
777    {B_DIRECT_SPECIFIER, 0}, "Gets the purge popularity limit.",
778    PN_PURGE_POPULARITY, {}, {}, {}},
779  {g_PropertyNames[PN_PURGE], {B_EXECUTE_PROPERTY, 0},
780    {B_DIRECT_SPECIFIER, 0}, "Purges the old obsolete words from the "
781    "database, if they are old enough according to the age limit and also "
782    "unpopular enough according to the popularity limit.", PN_PURGE,
783    {}, {}, {}},
784  {g_PropertyNames[PN_OLDEST], {B_GET_PROPERTY, 0},
785    {B_DIRECT_SPECIFIER, 0}, "Gets the age of the oldest message in the "
786    "database.  It's relative to the beginning of time, so you need to do "
787    "(total messages - age - 1) to see how many messages ago it was added.",
788    PN_OLDEST, {}, {}, {}},
789  {g_PropertyNames[PN_EVALUATE], {B_SET_PROPERTY, 0},
790    {B_DIRECT_SPECIFIER, 0}, "Evaluates a given file (by path name) to see "
791    "if it is spam or not.  Returns the ratio of spam probability vs genuine "
792    "probability, 0.0 meaning completely genuine, 1.0 for completely spam.  "
793    "Normally you should safely be able to consider it as spam if it is over "
794    "0.56 for the Robinson scoring method.  For the ChiSquared method, the "
795    "numbers are near 0 for genuine, near 1 for spam, and anywhere in the "
796    "middle means it can't decide.  The program attaches a MAIL:ratio_spam "
797    "attribute with the ratio as its "
798    "float32 value to the file.  Also returns the top few interesting words "
799    "in \"words\" and the associated per-word probability ratios in "
800    "\"ratios\".", PN_EVALUATE, {}, {}, {}},
801  {g_PropertyNames[PN_EVALUATE_STRING], {B_SET_PROPERTY, 0},
802    {B_DIRECT_SPECIFIER, 0}, "Like Evaluate, but rather than a file name, "
803    "the string argument contains the entire text of the message to be "
804    "evaluated.", PN_EVALUATE_STRING, {}, {}, {}},
805  {g_PropertyNames[PN_RESET_TO_DEFAULTS], {B_EXECUTE_PROPERTY, 0},
806    {B_DIRECT_SPECIFIER, 0}, "Resets all the configuration options to the "
807    "default values, including the database name.", PN_RESET_TO_DEFAULTS,
808    {}, {}, {}},
809  {g_PropertyNames[PN_INSTALL_THINGS], {B_EXECUTE_PROPERTY, 0},
810    {B_DIRECT_SPECIFIER, 0}, "Creates indices for the MAIL:classification and "
811    "MAIL:ratio_spam attributes on all volumes which support BeOS queries, "
812    "identifies them to the system as e-mail related attributes (modifies "
813    "the text/x-email MIME type), and sets up the new MIME type "
814    "(text/x-vnd.agmsmith.spam_probability_database) for the database file.  "
815    "Also registers names for the sound effects used by the separate filter "
816    "program (use the installsound BeOS program or the Sounds preferences "
817    "program to associate sound files with the names).", PN_INSTALL_THINGS,
818    {}, {}, {}},
819  {g_PropertyNames[PN_TOKENIZE_MODE], {B_SET_PROPERTY, 0},
820    {B_DIRECT_SPECIFIER, 0}, "Sets the method used for breaking up the "
821    "message into words.  Use \"Whole\" for the whole file (also use it for "
822    "non-email files).  The file isn't broken into parts; the whole thing is "
823    "converted into words, headers and attachments are just more raw data.  "
824    "Well, not quite raw data since it converts quoted-printable codes "
825    "(equals sign followed by hex digits or end of line) to the equivalent "
826    "single characters.  \"PlainText\" breaks the file into MIME components "
827    "and only looks at the ones which are of MIME type text/plain.  "
828    "\"AnyText\" will look for words in all text/* things, including "
829    "text/html attachments.  \"AllParts\" will decode all message components "
830    "and look for words in them, including binary attachments.  "
831    "\"JustHeader\" will only look for words in the message header.  "
832    "\"AllPartsAndHeader\", \"PlainTextAndHeader\" and \"AnyTextAndHeader\" "
833    "will also include the words from the message headers.", PN_TOKENIZE_MODE,
834    {}, {}, {}},
835  {g_PropertyNames[PN_TOKENIZE_MODE], {B_GET_PROPERTY, 0},
836    {B_DIRECT_SPECIFIER, 0}, "Gets the method used for breaking up the "
837    "message into words.", PN_TOKENIZE_MODE, {}, {}, {}},
838  {g_PropertyNames[PN_SCORING_MODE], {B_SET_PROPERTY, 0},
839    {B_DIRECT_SPECIFIER, 0}, "Sets the method used for combining the "
840    "probabilities of individual words into an overall score.  "
841    "\"Robinson\" mode will use Gary Robinson's nth root of the product "
842    "method.  It gives a nice range of values between 0 and 1 so you can "
843    "see shades of spaminess.  The cutoff point between spam and genuine "
844    "varies depending on your database of words (0.56 was one point in "
845    "some experiments).  \"ChiSquared\" mode will use chi-squared "
846    "statistics to evaluate the difference in probabilities that the lists "
847    "of word ratios are random.  The result is very close to 0 for genuine "
848    "and very close to 1 for spam, and near the middle if it is uncertain.",
849    PN_SCORING_MODE, {}, {}, {}},
850  {g_PropertyNames[PN_SCORING_MODE], {B_GET_PROPERTY, 0},
851    {B_DIRECT_SPECIFIER, 0}, "Gets the method used for combining the "
852    "individual word ratios into an overall score.", PN_SCORING_MODE,
853    {}, {}, {}},
854
855  { 0 }
856};
857
858
/* Scoring modes, available both as enum values and as the matching text
names.  The name table is indexed by the enum value, so the two lists have to
stay in the same order.  See PN_SCORING_MODE. */

enum ScoringModeEnum
{
  SM_ROBINSON,    /* First enumerator, so its value is zero. */
  SM_CHISQUARED,
  SM_MAX          /* Number of real modes, also the name table size. */
};
typedef enum ScoringModeEnum ScoringModes;

static const char * g_ScoringModeNames [SM_MAX] =
{
  "Robinson",     /* SM_ROBINSON */
  "ChiSquared"    /* SM_CHISQUARED */
};
873
874
/* The various tokenizing modes as text and enums.  See PN_TOKENIZE_MODE.  The
name table below is indexed by the enum value, so the two lists have to stay
in the same order, and TM_MAX (the number of real modes) has to stay last. */

typedef enum TokenizeModeEnum
{
  TM_WHOLE = 0,
  TM_PLAIN_TEXT,
  TM_PLAIN_TEXT_HEADER,
  TM_ANY_TEXT,
  TM_ANY_TEXT_HEADER,
  TM_ALL_PARTS,
  TM_ALL_PARTS_HEADER,
  TM_JUST_HEADER,
  TM_MAX
} TokenizeModes;

/* Text name of each tokenizing mode.  These are spelled exactly as the
PN_TOKENIZE_MODE property description advertises them to the user ("Whole",
"PlainText", "AnyTextAndHeader" and so on): single words with no spaces, in
the same style as the scoring mode names.  If a name is changed here, change
that description to match.  NOTE(review): anything that shows these strings
in the user interface will display the same spelling - confirm that is
acceptable. */

static const char * g_TokenizeModeNames [TM_MAX] =
{
  "Whole",              /* TM_WHOLE */
  "PlainText",          /* TM_PLAIN_TEXT */
  "PlainTextAndHeader", /* TM_PLAIN_TEXT_HEADER */
  "AnyText",            /* TM_ANY_TEXT */
  "AnyTextAndHeader",   /* TM_ANY_TEXT_HEADER */
  "AllParts",           /* TM_ALL_PARTS */
  "AllPartsAndHeader",  /* TM_ALL_PARTS_HEADER */
  "JustHeader"          /* TM_JUST_HEADER */
};
901
902
/* Possible message classifications.  CL_MAX isn't a classification itself; it
is the number of real ones, used for sizing tables indexed by classification
and as the "user cancelled" answer from the ClassificationChoicesWindow. */

enum ClassificationTypesEnum
{
  CL_GENUINE,     /* First enumerator, so its value is zero. */
  CL_SPAM,
  CL_UNCERTAIN,
  CL_MAX
};
typedef enum ClassificationTypesEnum ClassificationTypes;
912
/* Text name of each classification, indexed by ClassificationTypes.  The
first two entries reuse the g_ClassifiedGenuine and g_ClassifiedSpam strings
declared before this point rather than repeating their text here. */
static const char * g_ClassificationTypeNames [CL_MAX] =
{
  g_ClassifiedGenuine, /* CL_GENUINE */
  g_ClassifiedSpam, /* CL_SPAM */
  "Uncertain" /* CL_UNCERTAIN */
};
919
920
921/* Some polygon graphics for the scroll arrows. */
922
923static BPoint g_UpLinePoints [] =
924{
925  BPoint (8, 2 * (1)),
926  BPoint (14, 2 * (6)),
927  BPoint (10, 2 * (6)),
928  BPoint (10, 2 * (13)),
929  BPoint (6, 2 * (13)),
930  BPoint (6, 2 * (6)),
931  BPoint (2, 2 * (6))
932};
933
934static BPoint g_DownLinePoints [] =
935{
936  BPoint (8, 2 * (14-1)),
937  BPoint (14, 2 * (14-6)),
938  BPoint (10, 2 * (14-6)),
939  BPoint (10, 2 * (14-13)),
940  BPoint (6, 2 * (14-13)),
941  BPoint (6, 2 * (14-6)),
942  BPoint (2, 2 * (14-6))
943};
944
945static BPoint g_UpPagePoints [] =
946{
947  BPoint (8, 2 * (1)),
948  BPoint (13, 2 * (6)),
949  BPoint (10, 2 * (6)),
950  BPoint (14, 2 * (10)),
951  BPoint (10, 2 * (10)),
952  BPoint (10, 2 * (13)),
953  BPoint (6, 2 * (13)),
954  BPoint (6, 2 * (10)),
955  BPoint (2, 2 * (10)),
956  BPoint (6, 2 * (6)),
957  BPoint (3, 2 * (6))
958};
959
960static BPoint g_DownPagePoints [] =
961{
962  BPoint (8, 2 * (14-1)),
963  BPoint (13, 2 * (14-6)),
964  BPoint (10, 2 * (14-6)),
965  BPoint (14, 2 * (14-10)),
966  BPoint (10, 2 * (14-10)),
967  BPoint (10, 2 * (14-13)),
968  BPoint (6, 2 * (14-13)),
969  BPoint (6, 2 * (14-10)),
970  BPoint (2, 2 * (14-10)),
971  BPoint (6, 2 * (14-6)),
972  BPoint (3, 2 * (14-6))
973};
974
975
/* An array of flags to identify characters which are considered to be spaces.
If character code X has g_SpaceCharacters[X] set to true then it is a
space-like character.  Only codes 0 to 127 have an entry.  Character codes 128
and above are always non-space since they are bytes of multi-byte UTF-8
characters, so test for that range separately rather than using such a code
as an index into this array.  Initialised in the ABSApp constructor. */

static bool g_SpaceCharacters [128];
982
983
984
985/******************************************************************************
986 * Each word in the spam database gets one of these structures.  The database
987 * has a string (the word) as the key and this structure as the value
988 * (statistics for that word).
989 */
990
991typedef struct StatisticsStruct
992{
993  uint32 age;
994    /* Sequence number for the time when this word was last updated in the
995    database, so that we can remove old words (haven't been seen in recent
996    spam).  It's zero for the first file ever added (spam or genuine) to the
997    database, 1 for all words added or updated by the second file, etc.  If a
998    later file updates an existing word, it gets the age of the later file. */
999
1000  uint32 genuineCount;
1001    /* Number of genuine messages that have this word. */
1002
1003  uint32 spamCount;
1004    /* A count of the number of spam e-mail messages which contain the word. */
1005
1006} StatisticsRecord, *StatisticsPointer;
1007
1008typedef map<string, StatisticsRecord> StatisticsMap;
1009  /* Define this type which will be used for our main data storage facility, so
1010  we can more conveniently specify things that are derived from it, like
1011  iterators. */
1012
1013
1014
/******************************************************************************
 * An alert box asking how the user wants to mark messages.  There are buttons
 * for each classification category, and a checkbox to mark all remaining N
 * messages the same way.  And a cancel button.  To use it, first create the
 * ClassificationChoicesWindow, specifying the input arguments.  Then call the
 * Go method which will show the window, stuff the user's answer into your
 * output arguments (class set to CL_MAX if the user cancels), and destroy the
 * window.  Implemented because BAlert only allows 3 buttons, max!
 */

class ClassificationChoicesWindow : public BWindow
{
public:
  /* Constructor.  No destructor is declared; as described above, the Go
  method destroys the window once it has the user's answer. */
  ClassificationChoicesWindow (BRect FrameRect,
    const char *FileName, int NumberOfFiles);

  /* BeOS virtual functions. */
  virtual void MessageReceived (BMessage *MessagePntr);

  /* Our methods.  Go shows the window and reports the user's choices through
  its two output pointers: whether the "mark all remaining the same way"
  checkbox was selected, and the classification picked (CL_MAX if
  cancelled). */
  void Go (bool *BulkModeSelectedPntr,
    ClassificationTypes *ChoosenClassificationPntr);

  /* Various message codes for various buttons etc. */
  static const uint32 MSG_CLASS_BUTTONS = 'ClB0';
  static const uint32 MSG_CANCEL_BUTTON = 'Cncl';
  static const uint32 MSG_BULK_CHECKBOX = 'BlkK';

private:
  /* Member variables.  Presumably these remember the output pointers given
  to Go so that the answer can be stored when a message from a button or the
  checkbox arrives - confirm against the implementation. */
  bool *m_BulkModeSelectedPntr;
  ClassificationTypes *m_ChoosenClassificationPntr;
};
1049
/* A view which takes the same constructor arguments as the
ClassificationChoicesWindow.  Presumably it is the content view of that window
and creates the buttons and checkbox - confirm against the implementation. */
class ClassificationChoicesView : public BView
{
public:
  /* Constructor (no destructor is declared). */
  ClassificationChoicesView (BRect FrameRect,
    const char *FileName, int NumberOfFiles);

  /* BeOS virtual functions. */
  virtual void AttachedToWindow ();
  virtual void GetPreferredSize (float *width, float *height);

private:
  /* Member variables.  The first two have the same names and types as the
  constructor arguments; note that m_FileName is only a pointer, so if it is
  the constructor's FileName then that string has to outlive the view -
  confirm.  m_PreferredBottomY is presumably what GetPreferredSize uses for
  the height - confirm. */
  const char *m_FileName;
  int         m_NumberOfFiles;
  float       m_PreferredBottomY;
};
1067
1068
1069
/******************************************************************************
 * Due to deadlock problems with the BApplication posting scripting messages to
 * itself, we need to add a second Looper.  Its job is just to convert
 * command line arguments and arguments from the Tracker (refs received) into a
 * series of scripting commands sent to the main BApplication.  It also prints
 * out the replies received (to stdout for command line replies).  An instance
 * of this class will be created and run by the main() function, and shut down
 * by it too.
 */

class CommanderLooper : public BLooper
{
public:
  CommanderLooper ();
  ~CommanderLooper ();
  virtual void MessageReceived (BMessage *MessagePntr);

  /* The entry points for handing work to the looper: one for command line
  arguments and one for a message of file references.  For references, the
  caller can optionally turn on bulk mode and say which classification goes
  with it; the defaults are bulk mode off and CL_GENUINE.  Presumably these
  just queue up a MSG_COMMAND_ARGUMENTS or MSG_COMMAND_FILE_REFS message for
  the looper's own thread to process - confirm against the implementation. */
  void CommandArguments (int argc, char **argv);
  void CommandReferences (BMessage *MessagePntr,
    bool BulkMode = false,
    ClassificationTypes BulkClassification = CL_GENUINE);
  bool IsBusy ();

private:
  /* The workers for the two kinds of request. */
  void ProcessArgs (BMessage *MessagePntr);
  void ProcessRefs (BMessage *MessagePntr);

  /* Message codes for the two kinds of request. */
  static const uint32 MSG_COMMAND_ARGUMENTS = 'CArg';
  static const uint32 MSG_COMMAND_FILE_REFS = 'CRef';

  bool m_IsBusy;
    /* Presumably the flag which IsBusy returns - confirm. */
};
1102
1103
1104
/******************************************************************************
 * This view contains the various buttons and other controls for setting
 * configuration options and displaying the state of the database (but not the
 * actual list of words).  It will appear in the top half of the
 * DatabaseWindow.
 */

class ControlsView : public BView
{
public:
  /* Constructor and destructor. */
  ControlsView (BRect NewBounds);
  ~ControlsView ();

  /* BeOS virtual functions. */
  virtual void AttachedToWindow ();
  virtual void FrameResized (float Width, float Height);
  virtual void MessageReceived (BMessage *MessagePntr);
  virtual void Pulse ();

private:
  /* Various message codes for various buttons etc.  They are private, so
  only this class can refer to them by name. */
  static const uint32 MSG_BROWSE_BUTTON = 'Brws';
  static const uint32 MSG_DATABASE_NAME = 'DbNm';
  static const uint32 MSG_ESTIMATE_BUTTON = 'Estm';
  static const uint32 MSG_ESTIMATE_FILE_REFS = 'ERef';
  static const uint32 MSG_IGNORE_CLASSIFICATION = 'IPCl';
  static const uint32 MSG_PURGE_AGE = 'PuAg';
  static const uint32 MSG_PURGE_BUTTON = 'Purg';
  static const uint32 MSG_PURGE_POPULARITY = 'PuPo';
  static const uint32 MSG_SERVER_MODE = 'SrvM';

  /* Our member functions.  Presumably the two Browse functions show the two
  file panels declared below, and PollServerForChanges compares the
  application's current settings with the cached values below - confirm
  against the implementation. */
  void BrowseForDatabaseFile ();
  void BrowseForFileToEstimate ();
  void PollServerForChanges ();

  /* Member variables, in alphabetical order by name.  That puts each
  ...CachedValue member beside the control with the matching name; presumably
  the cached value is what that control last displayed, so that a poll can
  tell when the real value has changed - confirm.  The order of declaration
  is also the order of construction, so take care if rearranging them. */
  BButton        *m_AboutButtonPntr;
  BButton        *m_AddExampleButtonPntr;
  BButton        *m_BrowseButtonPntr;
  BFilePanel     *m_BrowseFilePanelPntr;
  BButton        *m_CreateDatabaseButtonPntr;
  char            m_DatabaseFileNameCachedValue [PATH_MAX];
  BTextControl   *m_DatabaseFileNameTextboxPntr;
  bool            m_DatabaseLoadDone;
  BButton        *m_EstimateSpamButtonPntr;
  BFilePanel     *m_EstimateSpamFilePanelPntr;
  uint32          m_GenuineCountCachedValue;
  BTextControl   *m_GenuineCountTextboxPntr;
  bool            m_IgnorePreviousClassCachedValue;
  BCheckBox      *m_IgnorePreviousClassCheckboxPntr;
  BButton        *m_InstallThingsButtonPntr;
  uint32          m_PurgeAgeCachedValue;
  BTextControl   *m_PurgeAgeTextboxPntr;
  BButton        *m_PurgeButtonPntr;
  uint32          m_PurgePopularityCachedValue;
  BTextControl   *m_PurgePopularityTextboxPntr;
  BButton        *m_ResetToDefaultsButtonPntr;
  ScoringModes    m_ScoringModeCachedValue;
  BMenuBar       *m_ScoringModeMenuBarPntr;
  BPopUpMenu     *m_ScoringModePopUpMenuPntr;
  bool            m_ServerModeCachedValue;
  BCheckBox      *m_ServerModeCheckboxPntr;
  uint32          m_SpamCountCachedValue;
  BTextControl   *m_SpamCountTextboxPntr;
  bigtime_t       m_TimeOfLastPoll;
  TokenizeModes   m_TokenizeModeCachedValue;
  BMenuBar       *m_TokenizeModeMenuBarPntr;
  BPopUpMenu     *m_TokenizeModePopUpMenuPntr;
  uint32          m_WordCountCachedValue;
  BTextControl   *m_WordCountTextboxPntr;
};
1178
1179
/* Message codes for the four scrolling actions: down or up, by a line or by a
page.  Unlike the other message codes in this file, these are declared at file
scope rather than inside a class.  Presumably they are what the WordsView
scroll arrow buttons send and what its MoveTextUpOrDown method accepts as the
movement type - confirm against the implementation. */
static const uint32 MSG_LINE_DOWN = 'LnDn';
static const uint32 MSG_LINE_UP = 'LnUp';
static const uint32 MSG_PAGE_DOWN = 'PgDn';
static const uint32 MSG_PAGE_UP = 'PgUp';
1185
/******************************************************************************
 * This view contains the list of words.  It displays as many as can fit in the
 * view rectangle, starting at a specified word (so it can simulate scrolling).
 * Usually it will appear in the bottom half of the DatabaseWindow.
 */

class WordsView : public BView
{
public:
  /* Constructor (no destructor is declared). */
  WordsView (BRect NewBounds);

  /* BeOS virtual functions. */
  virtual void AttachedToWindow ();
  virtual void Draw (BRect UpdateRect);
  virtual void KeyDown (const char *BufferPntr, int32 NumBytes);
  virtual void MakeFocus (bool Focused);
  virtual void MessageReceived (BMessage *MessagePntr);
  virtual void MouseDown (BPoint point);
  virtual void Pulse ();

private:
  /* Our member functions.  MovementType is presumably one of the file level
  MSG_LINE_UP, MSG_LINE_DOWN, MSG_PAGE_UP or MSG_PAGE_DOWN codes, and
  RefsDroppedHere presumably handles files dragged onto the view - confirm
  against the implementation. */
  void MoveTextUpOrDown (uint32 MovementType);
  void RefsDroppedHere (BMessage *MessagePntr);

  /* Member variables. */
  BPictureButton *m_ArrowLineDownPntr;
  BPictureButton *m_ArrowLineUpPntr;
  BPictureButton *m_ArrowPageDownPntr;
  BPictureButton *m_ArrowPageUpPntr;
    /* Various buttons for controlling scrolling, since we can't use a scroll
    bar.  To make them less obvious, their background view colour needs to be
    changed whenever the main view's colour changes. */

  float m_AscentHeight;
    /* The ascent height for the font used to draw words.  Height from the top
    of the highest letter to the base line (which is near the middle bottom of
    the letters, the line where you would align your writing of the text by
    hand, all letters have part above, some also have descenders below this
    line). */

  rgb_color m_BackgroundColour;
    /* The current background colour.  Changes when the focus changes. */

  uint32 m_CachedTotalGenuineMessages;
  uint32 m_CachedTotalSpamMessages;
  uint32 m_CachedWordCount;
    /* These are cached copies of the similar values in the BApplication.  They
    reflect what's currently displayed.  If they are different than the values
    from the BApplication then the polling loop will try to redraw the display.
    They get set to the values actually used during drawing when drawing is
    successful. */

  char m_FirstDisplayedWord [g_MaxWordLength + 1];
    /* The scrolling display starts at this word.  Since we can't use index
    numbers (word[12345] for example), we use the word itself.  The scroll
    buttons set this to the next or previous word in the database.  Typing by
    the user when the view has the focus will also change this starting word.
    The buffer is sized for a word of g_MaxWordLength characters plus one
    more byte, presumably for the terminating NUL. */

  rgb_color m_FocusedColour;
    /* The colour to use for focused mode (typing by the user is received by
    our view). */

  bigtime_t m_LastTimeAKeyWasPressed;
    /* Records the time when a key was last pressed.  Used for determining when
    the user has stopped typing a batch of letters. */

  float m_LineHeight;
    /* Height of a line of text in the font used for the word display.
    Includes the height of the letters plus a bit of extra space for between
    the lines (called leading). */

  BFont m_TextFont;
    /* The font used to draw the text in the window. */

  float m_TextHeight;
    /* Maximum total height of the letters in the text, includes the part above
    the baseline and the part below.  Doesn't include the sliver of space
    between lines. */

  rgb_color m_UnfocusedColour;
    /* The colour to use for unfocused mode, when user typing isn't active. */
};
1271
1272
1273
/******************************************************************************
 * The BWindow class for this program.  It displays the database in real time,
 * and has various buttons and gadgets in the top half for changing settings
 * (live changes, no OK button, and they reflect changes done by other programs
 * using the server too).  The bottom half is a scrolling view listing all the
 * words in the database.  A simple graphic blotch behind each word shows
 * whether the word is strongly or weakly related to spam or genuine messages.
 * Most operations go through the scripting message system, but it also peeks
 * at the BApplication data for examining simple things and when redrawing the
 * list of words.
 */

class DatabaseWindow : public BWindow
{
public:
  /* Constructor (no destructor is declared). */
  DatabaseWindow ();

  /* BeOS virtual functions. */
  virtual void MessageReceived (BMessage *MessagePntr);
  virtual bool QuitRequested ();

private:
  /* Member variables, the two halves of the window: the ControlsView for the
  settings in the top half and the WordsView for the word list in the bottom
  half. */
  ControlsView *m_ControlsViewPntr;
  WordsView    *m_WordsViewPntr;
};
1301
1302
1303
/******************************************************************************
 * ABSApp is the BApplication class for this program.  This handles messages
 * from the outside world (requests to load a database, or to add files to the
 * collection).  It responds to command line arguments (if you start up the
 * program a second time, the system will just send the arguments to the
 * existing running program).  It responds to scripting messages.  And it
 * responds to messages from the window.  Its thread does the main work of
 * updating the database and reading / writing files.
 */

class ABSApp : public BApplication
{
public:
  /* Constructor and destructor. */
  ABSApp ();
  ~ABSApp ();

  /* BeOS virtual functions. */
  virtual void AboutRequested ();
  virtual void ArgvReceived (int32 argc, char **argv);
  virtual status_t GetSupportedSuites (BMessage *MessagePntr);
  virtual void MessageReceived (BMessage *MessagePntr);
  virtual void Pulse ();
  virtual bool QuitRequested ();
  virtual void ReadyToRun ();
  virtual void RefsReceived (BMessage *MessagePntr);
  virtual BHandler *ResolveSpecifier (BMessage *MessagePntr, int32 Index,
    BMessage *SpecifierMsgPntr, int32 SpecificationKind, const char *Property);

private:
  /* Our member functions, in alphabetical order.  Most of them return a
  status_t and take an ErrorMessage buffer, presumably filled in with a
  description of the problem when something goes wrong.  NOTE(review): the
  size that buffer needs to be isn't visible in this declaration - check the
  callers and the implementation. */
  status_t AddFileToDatabase (ClassificationTypes IsSpamOrWhat,
    const char *FileName, char *ErrorMessage);
  status_t AddPositionIOToDatabase (ClassificationTypes IsSpamOrWhat,
    BPositionIO *MessageIOPntr, const char *OptionalFileName,
    char *ErrorMessage);
  status_t AddStringToDatabase (ClassificationTypes IsSpamOrWhat,
    const char *String, char *ErrorMessage);
  void AddWordsToSet (const char *InputString, size_t NumberOfBytes,
    char PrefixCharacter, set<string> &WordSet);
  status_t CreateDatabaseFile (char *ErrorMessage);
  void DefaultSettings ();
  status_t DeleteDatabaseFile (char *ErrorMessage);
  status_t EvaluateFile (const char *PathName, BMessage *ReplyMessagePntr,
    char *ErrorMessage);
  status_t EvaluatePositionIO (BPositionIO *PositionIOPntr,
    const char *OptionalFileName, BMessage *ReplyMessagePntr,
    char *ErrorMessage);
  status_t EvaluateString (const char *BufferPntr, ssize_t BufferSize,
    BMessage *ReplyMessagePntr, char *ErrorMessage);
  status_t GetWordsFromPositionIO (BPositionIO *PositionIOPntr,
    const char *OptionalFileName, set<string> &WordSet, char *ErrorMessage);
  status_t InstallThings (char *ErrorMessage);
  status_t LoadDatabaseIfNeeded (char *ErrorMessage);
  status_t LoadSaveDatabase (bool DoLoad, char *ErrorMessage);
public:
  /* This one is public, unlike its neighbours, presumably so that code
  outside the class can load or save the settings - confirm who calls it. */
  status_t LoadSaveSettings (bool DoLoad);
private:
  status_t MakeBackup (char *ErrorMessage);
  void MakeDatabaseEmpty ();
  void ProcessScriptingMessage (BMessage *MessagePntr,
    struct property_info *PropInfoPntr);
  status_t PurgeOldWords (char *ErrorMessage);
  status_t RecursivelyTokenizeMailComponent (
    BMailComponent *ComponentPntr, const char *OptionalFileName,
    set<string> &WordSet, char *ErrorMessage,
    int RecursionLevel, int MaxRecursionLevel);
  status_t SaveDatabaseIfNeeded (char *ErrorMessage);
  status_t TokenizeParts (BPositionIO *PositionIOPntr,
    const char *OptionalFileName, set<string> &WordSet, char *ErrorMessage);
  status_t TokenizeWhole (BPositionIO *PositionIOPntr,
    const char *OptionalFileName, set<string> &WordSet, char *ErrorMessage);

public:
  /* Member variables.  Many are read by the window thread to see if it needs
  updating, and to draw the words.  However, the other threads will lock the
  BApplication or use scripting commands if they want to make changes. */

  bool m_DatabaseHasChanged;
    /* Set to TRUE when the in-memory database (stored in m_WordMap) has
    changed and is different from the on-disk database file.  When the
    application exits, the database will be written out if it has changed. */

  BString m_DatabaseFileName;
    /* The absolute path name to use for the database file on disk. */

  bool m_IgnorePreviousClassification;
    /* If TRUE then the previous classification of a message (stored in an
    attribute on the message file) will be ignored, and the message will be
    added to the requested spam/genuine list.  If this is FALSE then the spam
    won't be added to the list if it has already been classified as specified,
    but if it was mis-classified, it will be removed from the old list and
    added to the new list. */

  uint32 m_OldestAge;
    /* The age of the oldest word.  This will be the smallest age number in the
    database.  Mostly useful for scaling graphics representing age in the word
    display.  If the oldest word is no longer the oldest, this variable won't
    get immediately updated since it would take a lot of effort to find the
    next older age.  Since it's only used for display, we'll let it be slightly
    incorrect.  The next database load or purge will fix it. */

  uint32 m_PurgeAge;
    /* When purging old words, they have to be at least this old to be eligible
    for deletion.  Age is measured as the number of e-mails added to the
    database since the word was last updated in the database.  Zero means all
    words are old. */

  uint32 m_PurgePopularity;
    /* When purging old words, they have to be less than or equal to this
    popularity limit to be eligible for deletion.  Popularity is measured as
    the number of messages (spam and genuine) which have the word.  Zero means
    no words. */

  ScoringModes m_ScoringMode;
    /* Controls how to combine the word probabilities into an overall score.
    See the PN_SCORING_MODE comments for details. */

  BPath m_SettingsDirectoryPath;
    /* The constructor initialises this to the settings directory path.  It
    never changes after that. */

  bool m_SettingsHaveChanged;
    /* Set to TRUE when the settings are changed (different than the ones which
    were loaded).  When the application exits, the settings will be written out
    if they have changed. */

  double m_SmallestUseableDouble;
    /* When multiplying fractional numbers together, avoid using numbers
    smaller than this because the double exponent range is close to being
    exhausted.  The IEEE STANDARD 754 floating-point arithmetic (used on the
    Intel i8087 and later math processors) has 64 bit numbers with 53 bits of
    mantissa, giving it an underflow starting at 0.5**1022 = 2.2e-308 where it
    rounds off to the nearest multiple of 0.5**1074 = 4.9e-324. */

  TokenizeModes m_TokenizeMode;
    /* Controls how to convert the raw message text into words.  See the
    PN_TOKENIZE_MODE comments for details. */

  uint32 m_TotalGenuineMessages;
    /* Number of genuine messages which are in the database. */

  uint32 m_TotalSpamMessages;
    /* Number of spam messages which are in the database. */

  uint32 m_WordCount;
    /* The number of words currently in the database.  Stored separately as a
    member variable to avoid having to call m_WordMap.size() all the time,
    which other threads can't do while the database is being updated (but they
    can look at the word count variable). */

  StatisticsMap m_WordMap;
    /* The in-memory data structure holding the set of words and their
    associated statistics.  When the database isn't in use, it is an empty
    collection.  You should lock the BApplication if you are using the word
    collection (reading or writing) from another thread. */
};
1461
1462
1463
/******************************************************************************
 * Global utility function to display an error message and return.  The message
 * part describes the error, and if ErrorNumber is non-zero, gets the string
 * ", error code $X/N (standard description) has occured." appended to it
 * (X is the error number in hexadecimal, N is the same number in decimal).
 * If the message is NULL then it gets defaulted to "Something went wrong", or
 * to "No error, no message, why bother?" if ErrorNumber is zero as well.  The
 * title part doesn't get displayed in the dialog box (it has no title bar, but
 * you can see it in the debugger as the window thread name), and defaults to
 * "SpamDBM Error Message" if you didn't specify one.  If running in command
 * line mode or server mode, the title and message get printed to stderr
 * rather than showing up in a dialog box.
 */
1474
1475static void
1476DisplayErrorMessage (
1477  const char *MessageString = NULL,
1478  int ErrorNumber = 0,
1479  const char *TitleString = NULL)
1480{
1481  BAlert *AlertPntr;
1482  char ErrorBuffer [PATH_MAX + 1500];
1483
1484  if (TitleString == NULL)
1485    TitleString = "SpamDBM Error Message";
1486
1487  if (MessageString == NULL)
1488  {
1489    if (ErrorNumber == 0)
1490      MessageString = "No error, no message, why bother?";
1491    else
1492      MessageString = "Something went wrong";
1493  }
1494
1495  if (ErrorNumber != 0)
1496  {
1497    sprintf (ErrorBuffer, "%s, error code $%X/%d (%s) has occured.",
1498      MessageString, ErrorNumber, ErrorNumber, strerror (ErrorNumber));
1499    MessageString = ErrorBuffer;
1500  }
1501
1502  if (g_CommandLineMode || g_ServerMode)
1503    cerr << TitleString << ": " << MessageString << endl;
1504  else
1505  {
1506    AlertPntr = new BAlert (TitleString, MessageString,
1507      "Acknowledge", NULL, NULL, B_WIDTH_AS_USUAL, B_STOP_ALERT);
1508    if (AlertPntr != NULL) {
1509      AlertPntr->SetFlags(AlertPntr->Flags() | B_CLOSE_ON_ESCAPE);
1510      AlertPntr->Go ();
1511    }
1512  }
1513}
1514
1515
1516
1517/******************************************************************************
1518 * Word wrap a long line of text into shorter 79 column lines and print the
1519 * result on the given output stream.
1520 */
1521
static void
WrapTextToStream (ostream& OutputStream, const char *TextPntr)
{
  /* Prints TextPntr on OutputStream as a series of lines, each at most
  LineLength characters long.  Lines are broken at white space where possible;
  a run of text with no white space in it is chopped into LineLength sized
  pieces.  White space at the start of each output line is skipped, and so is
  the white space at a line break.  A NULL or empty text prints nothing. */

  const int LineLength = 79;
  int       BreakIndex;
  int       ChunkLength;

  if (TextPntr == NULL)
    return; /* Nothing to print. */

  while (true)
  {
    /* Skip leading spaces.  The characters are converted to unsigned char
    since the ctype functions are undefined for negative values, which is what
    UTF-8 bytes turn into when char is signed. */

    while (isspace ((unsigned char) *TextPntr))
      TextPntr++;
    if (*TextPntr == 0)
      break; /* It was all spaces, don't print any more. */

    /* See how much text is left, looking no further than one full line. */

    ChunkLength = 0;
    while (ChunkLength < LineLength && TextPntr [ChunkLength] != 0)
      ChunkLength++;

    if (ChunkLength < LineLength)
    {
      /* The remainder fits completely, print it as is and finish. */
      OutputStream.write (TextPntr, ChunkLength);
      OutputStream << endl;
      break;
    }

    /* Got a full line's worth of text.  If the text stops or has white space
    right after it, then the whole chunk can be used (a line which exactly
    fills the width doesn't need wrapping).  Otherwise the chunk ends in the
    middle of a word, so back up to the last white space in the chunk. */

    BreakIndex = ChunkLength;
    if (TextPntr [ChunkLength] != 0 &&
    !isspace ((unsigned char) TextPntr [ChunkLength]))
    {
      while (BreakIndex > 0 &&
      !isspace ((unsigned char) TextPntr [BreakIndex]))
        BreakIndex--;
    }

    /* Remove more trailing spaces at the end of the line, in case there were
    several spaces in a row. */

    while (BreakIndex > 0 &&
    isspace ((unsigned char) TextPntr [BreakIndex - 1]))
      BreakIndex--;

    if (BreakIndex == 0)
      BreakIndex = ChunkLength; /* No spaces at all, just split off a chunk. */

    /* Print the line of text and advance the text pointer too. */

    OutputStream.write (TextPntr, BreakIndex);
    OutputStream << endl;
    TextPntr += BreakIndex;
  }
}
1586
1587
1588
1589/******************************************************************************
1590 * Print the usage info to the stream.  Includes a list of all commands.
1591 */
1592ostream& PrintUsage (ostream& OutputStream);
1593
1594ostream& PrintUsage (ostream& OutputStream)
1595{
1596  struct property_info *PropInfoPntr;
1597
1598  OutputStream << "\nSpamDBM - A Spam Database Manager\n";
1599  OutputStream << "Copyright �� 2002 by Alexander G. M. Smith.  ";
1600  OutputStream << "Released to the public domain.\n\n";
1601  WrapTextToStream (OutputStream, "Compiled on " __DATE__ " at " __TIME__
1602".  $Id: spamdbm.cpp 30630 2009-05-05 01:31:01Z bga $  $HeadURL: http://svn.haiku-os.org/haiku/haiku/trunk/src/bin/mail_utils/spamdbm.cpp $");
1603  OutputStream << "\n"
1604"This is a program for classifying e-mail messages as spam (junk mail which\n"
1605"you don't want to read) and regular genuine messages.  It can learn what's\n"
1606"spam and what's genuine.  You just give it a bunch of spam messages and a\n"
1607"bunch of non-spam ones.  It uses them to make a list of the words from the\n"
1608"messages with the probability that each word is from a spam message or from\n"
1609"a genuine message.  Later on, it can use those probabilities to classify\n"
1610"new messages as spam or not spam.  If the classifier stops working well\n"
1611"(because the spammers have changed their writing style and vocabulary, or\n"
1612"your regular correspondants are writing like spammers), you can use this\n"
1613"program to update the list of words to identify the new messages\n"
1614"correctly.\n"
1615"\n"
1616"The original idea was from Paul Graham's algorithm, which has an excellent\n"
1617"writeup at: http://www.paulgraham.com/spam.html\n"
1618"\n"
1619"Gary Robinson came up with the improved algorithm, which you can read about at:\n"
1620"http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html\n"
1621"\n"
1622"Then he, Tim Peters and the SpamBayes mailing list developed the Chi-Squared\n"
1623"test, see http://mail.python.org/pipermail/spambayes/2002-October/001036.html\n"
1624"for one of the earlier messages leading from the central limit theorem to\n"
1625"the current chi-squared scoring method.\n"
1626"\n"
1627"Thanks go to Isaac Yonemoto for providing a better icon, which we can\n"
1628"unfortunately no longer use, since the Hormel company wants people to\n"
1629"avoid associating their meat product with junk e-mail.\n"
1630"\n"
1631"Tokenising code updated in 2005 to use some of the tricks that SpamBayes\n"
1632"uses to extract words from messages.  In particular, HTML is now handled.\n"
1633"\n"
1634"Usage: Specify the operation as the first argument followed by more\n"
1635"information as appropriate.  The program's configuration will affect the\n"
1636"actual operation (things like the name of the database file to use, or\n"
1637"whether it should allow non-email messages to be added).  In command line\n"
1638"mode it will do the operation and exit.  In GUI/server mode a command line\n"
1639"invocation will just send the command to the running server.  You can also\n"
1640"use BeOS scripting (see the \"Hey\" command which you can get from\n"
1641"http://www.bebits.com/app/2042 ) to control the Spam server.  And finally,\n"
1642"there's also a GUI interface which shows up if you start it without any\n"
1643"command line arguments.\n"
1644"\n"
1645"Commands:\n"
1646"\n"
1647"Quit\n"
1648"Stop the program.  Useful if it's running as a server.\n"
1649"\n";
1650
1651  /* Go through all our scripting commands and add a description of each one to
1652  the usage text. */
1653
1654  for (PropInfoPntr = g_ScriptingPropertyList + 0;
1655  PropInfoPntr->name != 0;
1656  PropInfoPntr++)
1657  {
1658    switch (PropInfoPntr->commands[0])
1659    {
1660      case B_GET_PROPERTY:
1661        OutputStream << "Get " << PropInfoPntr->name << endl;
1662        break;
1663
1664      case B_SET_PROPERTY:
1665        OutputStream << "Set " << PropInfoPntr->name << " NewValue" << endl;
1666        break;
1667
1668      case B_COUNT_PROPERTIES:
1669        OutputStream << "Count " << PropInfoPntr->name << endl;
1670        break;
1671
1672      case B_CREATE_PROPERTY:
1673        OutputStream << "Create " << PropInfoPntr->name << endl;
1674        break;
1675
1676      case B_DELETE_PROPERTY:
1677        OutputStream << "Delete " << PropInfoPntr->name << endl;
1678        break;
1679
1680      case B_EXECUTE_PROPERTY:
1681        OutputStream << PropInfoPntr->name << endl;
1682        break;
1683
1684      default:
1685        OutputStream << "Buggy Command: " << PropInfoPntr->name << endl;
1686        break;
1687    }
1688    WrapTextToStream (OutputStream, (char *)PropInfoPntr->usage);
1689    OutputStream << endl;
1690  }
1691
1692  return OutputStream;
1693}
1694
1695
1696
1697/******************************************************************************
1698 * A utility function to send a command to the application, will return after a
1699 * short delay if the application is busy (doesn't wait for it to be executed).
1700 * The reply from the application is also thrown away.  It used to be an
1701 * overloaded function, but the system couldn't distinguish between bool and
1702 * int, so now it has slightly different names depending on the arguments.
1703 */
1704
1705static void
1706SubmitCommand (BMessage& CommandMessage)
1707{
1708  status_t ErrorCode;
1709
1710  ErrorCode = be_app_messenger.SendMessage (&CommandMessage,
1711    be_app_messenger /* reply messenger, throw away the reply */,
1712    1000000 /* delivery timeout */);
1713
1714  if (ErrorCode != B_OK)
1715    cerr << "SubmitCommand failed to send a command, code " <<
1716    ErrorCode << " (" << strerror (ErrorCode) << ")." << endl;
1717}
1718
1719
1720static void
1721SubmitCommandString (
1722  PropertyNumbers Property,
1723  uint32 CommandCode,
1724  const char *StringArgument = NULL)
1725{
1726  BMessage CommandMessage (CommandCode);
1727
1728  if (Property < 0 || Property >= PN_MAX)
1729  {
1730    DisplayErrorMessage ("SubmitCommandString bug.");
1731    return;
1732  }
1733  CommandMessage.AddSpecifier (g_PropertyNames [Property]);
1734  if (StringArgument != NULL)
1735    CommandMessage.AddString (g_DataName, StringArgument);
1736  SubmitCommand (CommandMessage);
1737}
1738
1739
1740static void
1741SubmitCommandInt32 (
1742  PropertyNumbers Property,
1743  uint32 CommandCode,
1744  int32 Int32Argument)
1745{
1746  BMessage CommandMessage (CommandCode);
1747
1748  if (Property < 0 || Property >= PN_MAX)
1749  {
1750    DisplayErrorMessage ("SubmitCommandInt32 bug.");
1751    return;
1752  }
1753  CommandMessage.AddSpecifier (g_PropertyNames [Property]);
1754  CommandMessage.AddInt32 (g_DataName, Int32Argument);
1755  SubmitCommand (CommandMessage);
1756}
1757
1758
1759static void
1760SubmitCommandBool (
1761  PropertyNumbers Property,
1762  uint32 CommandCode,
1763  bool BoolArgument)
1764{
1765  BMessage CommandMessage (CommandCode);
1766
1767  if (Property < 0 || Property >= PN_MAX)
1768  {
1769    DisplayErrorMessage ("SubmitCommandBool bug.");
1770    return;
1771  }
1772  CommandMessage.AddSpecifier (g_PropertyNames [Property]);
1773  CommandMessage.AddBool (g_DataName, BoolArgument);
1774  SubmitCommand (CommandMessage);
1775}
1776
1777
1778
1779/******************************************************************************
1780 * A utility function which will estimate the spaminess of file(s), not
1781 * callable from the application thread since it sends a scripting command to
1782 * the application and waits for results.  For each file there will be an entry
1783 * reference in the message.  For each of those, run it through the spam
1784 * estimator and display a box with the results.  This function is used both by
1785 * the file requestor and by dragging and dropping into the middle of the words
1786 * view.
1787 */
1788
1789static void
1790EstimateRefFilesAndDisplay (BMessage *MessagePntr)
1791{
1792  BAlert     *AlertPntr;
1793  BEntry      Entry;
1794  entry_ref   EntryRef;
1795  status_t    ErrorCode;
1796  int         i, j;
1797  BPath       Path;
1798  BMessage    ReplyMessage;
1799  BMessage    ScriptingMessage;
1800  const char *StringPntr;
1801  float       TempFloat;
1802  int32       TempInt32;
1803  char        TempString [PATH_MAX + 1024 +
1804                g_MaxInterestingWords * (g_MaxWordLength + 16)];
1805
1806  for (i = 0; MessagePntr->FindRef ("refs", i, &EntryRef) == B_OK; i++)
1807  {
1808    /* See if the entry is a valid file or directory or other thing. */
1809
1810    ErrorCode = Entry.SetTo (&EntryRef, true /* traverse symbolic links */);
1811    if (ErrorCode != B_OK || !Entry.Exists () || Entry.GetPath (&Path) != B_OK)
1812      continue;
1813
1814    /* Evaluate the spaminess of the file. */
1815
1816    ScriptingMessage.MakeEmpty ();
1817    ScriptingMessage.what = B_SET_PROPERTY;
1818    ScriptingMessage.AddSpecifier (g_PropertyNames[PN_EVALUATE]);
1819    ScriptingMessage.AddString (g_DataName, Path.Path ());
1820
1821    if (be_app_messenger.SendMessage (&ScriptingMessage,&ReplyMessage) != B_OK)
1822      break; /* App has died or something is wrong. */
1823
1824    if (ReplyMessage.FindInt32 ("error", &TempInt32) != B_OK ||
1825    TempInt32 != B_OK)
1826      break; /* Error messages will be displayed elsewhere. */
1827
1828    ReplyMessage.FindFloat (g_ResultName, &TempFloat);
1829    sprintf (TempString, "%f spam ratio for \"%s\".\nThe top words are:",
1830      (double) TempFloat, Path.Path ());
1831
1832    for (j = 0; j < 20 /* Don't print too many! */; j++)
1833    {
1834      if (ReplyMessage.FindString ("words", j, &StringPntr) != B_OK ||
1835      ReplyMessage.FindFloat ("ratios", j, &TempFloat) != B_OK)
1836        break;
1837
1838      sprintf (TempString + strlen (TempString), "\n%s / %f",
1839        StringPntr, TempFloat);
1840    }
1841    if (j >= 20 && j < g_MaxInterestingWords)
1842      sprintf (TempString + strlen (TempString), "\nAnd up to %d more words.",
1843        g_MaxInterestingWords - j);
1844
1845    AlertPntr = new BAlert ("Estimate", TempString, "OK");
1846    if (AlertPntr != NULL) {
1847      AlertPntr->SetFlags(AlertPntr->Flags() | B_CLOSE_ON_ESCAPE);
1848      AlertPntr->Go ();
1849    }
1850  }
1851}
1852
1853
1854
1855/******************************************************************************
1856 * A utility function from the http://sourceforge.net/projects/spambayes
1857 * SpamBayes project.  Return prob(chisq >= x2, with v degrees of freedom).  It
1858 * computes the probability that the chi-squared value (a kind of normalized
1859 * error measurement), with v degrees of freedom, would be larger than a given
1860 * number (x2; chi is the Greek letter X thus x2).  So you can tell if the
1861 * error is really unusual (the returned probability is near zero meaning that
1862 * your measured error number is kind of large - actual chi-squared is rarely
1863 * above that number merely due to random effects), or if it happens often
1864 * (usually if the probability is over 5% then it's within 3 standard
1865 * deviations - meaning that chi-squared goes over your number fairly often due
1866 * merely to random effects).  v must be even for this calculation to work.
1867 */
1868
static double ChiSquaredProbability (double x2, int v)
{
  /* Returns the probability that a chi-squared value with v degrees of freedom
  is at least x2, by summing the series exp(-m) * m**i / i! for i from 0 to
  v/2 - 1, where m is x2/2.  Only works for even v; an odd v gets -1.0 back as
  an out of range hint.  The result is never more than 1.0. */

  if ((v & 1) != 0)
    return -1.0; /* Odd degrees of freedom aren't supported. */

  const int    TermCount = v / 2;
  const double HalfX2 = x2 / 2.0;

  /* If x2 is very large, exp() underflows to 0 and so does the whole sum. */

  double Term = exp (-HalfX2);
  double Total = Term;

  for (int k = 1; k < TermCount; ++k)
  {
    Term *= HalfX2 / k; /* Turns the previous term into the next one. */
    Total += Term;
  }

  /* With small x2 and large v, accumulated roundoff error plus error in the
  platform exp() can push the sum a few ULP above 1.0, which is no good for a
  probability, so clamp it. */

  return (Total > 1.0) ? 1.0 : Total;
}
1898
1899
1900
1901/******************************************************************************
1902 * A utility function to remove the "[Spam 99.9%] " from in front of the
1903 * MAIL:subject attribute of a file.
1904 */
1905
1906static status_t RemoveSpamPrefixFromSubjectAttribute (BNode *BNodePntr)
1907{
1908  status_t    ErrorCode;
1909  const char *MailSubjectName = "MAIL:subject";
1910  char       *StringPntr;
1911  char        SubjectString [2000];
1912
1913  ErrorCode = BNodePntr->ReadAttr (MailSubjectName,
1914    B_STRING_TYPE, 0 /* offset */, SubjectString,
1915    sizeof (SubjectString) - 1);
1916  if (ErrorCode <= 0)
1917    return 0; /* The attribute isn't there so we don't care. */
1918  if (ErrorCode >= (int) sizeof (SubjectString) - 1)
1919    return 0; /* Can't handle subjects which are too long. */
1920
1921  SubjectString [ErrorCode] = 0;
1922  ErrorCode = 0; /* So do-nothing exit returns zero. */
1923  if (strncmp (SubjectString, "[Spam ", 6) == 0)
1924  {
1925    for (StringPntr = SubjectString;
1926    *StringPntr != 0 && *StringPntr != ']'; StringPntr++)
1927      ; /* No body in this for loop. */
1928    if (StringPntr[0] == ']' && StringPntr[1] == ' ')
1929    {
1930      ErrorCode = BNodePntr->RemoveAttr (MailSubjectName);
1931      ErrorCode = BNodePntr->WriteAttr (MailSubjectName,
1932        B_STRING_TYPE, 0 /* offset */,
1933        StringPntr + 2, strlen (StringPntr + 2) + 1);
1934      if (ErrorCode > 0)
1935        ErrorCode = 0;
1936    }
1937  }
1938
1939  return ErrorCode;
1940}
1941
1942
1943
1944/******************************************************************************
1945 * The tokenizing functions.  To make tokenization of the text easier to
1946 * understand, it is broken up into several passes.  Each pass goes over the
1947 * text (can include NUL bytes) and extracts all the words it can recognise
1948 * (can be none).  The extracted words are added to the WordSet, with the
1949 * PrefixCharacter prepended (zero if none) so we can distinguish between words
1950 * found in headers and in the text body.  It also modifies the input text
1951 * buffer in-place to change the text that the next pass will see (blanking out
1952 * words that it wants to delete, but not inserting much new text since the
1953 * buffer can't be enlarged).  They all return the number of bytes remaining in
1954 * InputString after it has been modified to be input for the next pass.
1955 * Returns zero if it has exhausted the possibility of getting more words, or
1956 * if something goes wrong.
1957 */
1958
static size_t TokenizerPassLowerCase (
  char *BufferPntr,
  size_t NumberOfBytes)
{
  /* Converts the ASCII capital letters in the first NumberOfBytes bytes of the
  buffer to lower case, in place.  All other bytes, including NULs and bytes
  with the high bit set, are left alone.  Returns NumberOfBytes since the text
  doesn't change size. */

  const char CaseOffset = 'a' - 'A';
  size_t     Index;

  for (Index = 0; Index < NumberOfBytes; Index++)
  {
    /* Do our own lower case conversion; tolower () has problems with UTF-8
    characters that have the high bit set. */

    char &Letter = BufferPntr [Index];
    if (Letter >= 'A' && Letter <= 'Z')
      Letter += CaseOffset;
  }
  return NumberOfBytes;
}
1978
1979
1980/* A utility function for some commonly repeated code.  If this was Modula-2,
1981we could use a nested procedure.  But it's not.  Adds the given word to the set
1982of words, checking for maximum word length and prepending the prefix to the
1983word, which gets modified by this function to reflect the word actually added
1984to the set. */
1985
1986static void
1987AddWordAndPrefixToSet (
1988  string &Word,
1989  const char *PrefixString,
1990  set<string> &WordSet)
1991{
1992  if (Word.empty ())
1993    return;
1994
1995  if (Word.size () > g_MaxWordLength)
1996    Word.resize (g_MaxWordLength);
1997  Word.insert (0, PrefixString);
1998  WordSet.insert (Word);
1999}
2000
2001
2002/* Hunt through the text for various URLs and extract the components as
2003separate words.  Doesn't affect the text in the buffer.  Looks for
2004protocol://user:password@computer:port/path?query=key#anchor strings.  Also
2005www.blah strings are detected and broken down.  Doesn't do HREF="" strings
2006where the string has a relative path (no host computer name).  Assumes the
2007input buffer is already in lower case. */
2008
2009static size_t TokenizerPassExtractURLs (
2010  char *BufferPntr,
2011  size_t NumberOfBytes,
2012  char PrefixCharacter,
2013  set<string> &WordSet)
2014{
2015  char   *AtSignStringPntr;
2016  char   *HostStringPntr;
2017  char   *InputStringEndPntr;
2018  char   *InputStringPntr;
2019  char   *OptionsStringPntr;
2020  char   *PathStringPntr;
2021  char    PrefixString [2];
2022  char   *ProtocolStringPntr;
2023  string  Word;
2024
2025  InputStringPntr = BufferPntr;
2026  InputStringEndPntr = BufferPntr + NumberOfBytes;
2027  PrefixString [0] = PrefixCharacter;
2028  PrefixString [1] = 0;
2029
2030  while (InputStringPntr < InputStringEndPntr - 4)
2031  {
2032    HostStringPntr = NULL;
2033    if (memcmp (InputStringPntr, "www.", 4) == 0)
2034      HostStringPntr = InputStringPntr;
2035    else if (memcmp (InputStringPntr, "://", 3) == 0)
2036    {
2037      /* Find the protocol name, and add it as a word such as "ftp:" "http:" */
2038      ProtocolStringPntr = InputStringPntr;
2039      while (ProtocolStringPntr > BufferPntr &&
2040      isalpha (ProtocolStringPntr[-1]))
2041        ProtocolStringPntr--;
2042      Word.assign (ProtocolStringPntr,
2043        (InputStringPntr - ProtocolStringPntr) + 1 /* for the colon */);
2044      AddWordAndPrefixToSet (Word, PrefixString, WordSet);
2045      HostStringPntr = InputStringPntr + 3; /* Skip past the "://" */
2046    }
2047    if (HostStringPntr == NULL)
2048    {
2049      InputStringPntr++;
2050      continue;
2051    }
2052
2053    /* Got a host name string starting at HostStringPntr.  It's everything
2054    until the next slash or space, like "user:password@computer:port". */
2055
2056    InputStringPntr = HostStringPntr;
2057    AtSignStringPntr = NULL;
2058    while (InputStringPntr < InputStringEndPntr &&
2059    (*InputStringPntr != '/' && !isspace (*InputStringPntr)))
2060    {
2061      if (*InputStringPntr == '@')
2062        AtSignStringPntr = InputStringPntr;
2063      InputStringPntr++;
2064    }
2065    if (AtSignStringPntr != NULL)
2066    {
2067      /* Add a word with the user and password, unseparated. */
2068      Word.assign (HostStringPntr,
2069        AtSignStringPntr - HostStringPntr + 1 /* for the @ sign */);
2070      AddWordAndPrefixToSet (Word, PrefixString, WordSet);
2071      HostStringPntr = AtSignStringPntr + 1;
2072    }
2073
2074    /* Add a word with the computer and port, unseparated. */
2075
2076    Word.assign (HostStringPntr, InputStringPntr - HostStringPntr);
2077    AddWordAndPrefixToSet (Word, PrefixString, WordSet);
2078
2079    /* Now get the path name, not including the extra junk after ?  and #
2080    separators (they're stored as separate options).  Stops at white space or a
2081    double quote mark. */
2082
2083    PathStringPntr = InputStringPntr;
2084    OptionsStringPntr = NULL;
2085    while (InputStringPntr < InputStringEndPntr &&
2086    (*InputStringPntr != '"' && !isspace (*InputStringPntr)))
2087    {
2088      if (OptionsStringPntr == NULL &&
2089      (*InputStringPntr == '?' || *InputStringPntr == '#'))
2090        OptionsStringPntr = InputStringPntr;
2091      InputStringPntr++;
2092    }
2093
2094    if (OptionsStringPntr == NULL)
2095    {
2096      /* No options, all path. */
2097      Word.assign (PathStringPntr, InputStringPntr - PathStringPntr);
2098      AddWordAndPrefixToSet (Word, PrefixString, WordSet);
2099    }
2100    else
2101    {
2102      /* Insert the path before the options. */
2103      Word.assign (PathStringPntr, OptionsStringPntr - PathStringPntr);
2104      AddWordAndPrefixToSet (Word, PrefixString, WordSet);
2105
2106      /* Insert all the options as a word. */
2107      Word.assign (OptionsStringPntr, InputStringPntr - OptionsStringPntr);
2108      AddWordAndPrefixToSet (Word, PrefixString, WordSet);
2109    }
2110  }
2111  return NumberOfBytes;
2112}
2113
2114
2115/* Replace long Asian words (likely to actually be sentences) with the first
2116character in the word. */
2117
2118static size_t TokenizerPassTruncateLongAsianWords (
2119  char *BufferPntr,
2120  size_t NumberOfBytes)
2121{
2122  char *EndOfStringPntr;
2123  char *InputStringPntr;
2124  int   Letter;
2125  char *OutputStringPntr;
2126  char *StartOfInputLongUnicodeWord;
2127  char *StartOfOutputLongUnicodeWord;
2128
2129  InputStringPntr = BufferPntr;
2130  EndOfStringPntr = InputStringPntr + NumberOfBytes;
2131  OutputStringPntr = InputStringPntr;
2132  StartOfInputLongUnicodeWord = NULL; /* Non-NULL flags it as started. */
2133  StartOfOutputLongUnicodeWord = NULL;
2134
2135  /* Copy the text from the input to the output (same buffer), but when we find
2136  a sequence of UTF-8 characters that is too long then truncate it down to one
2137  character and reset the output pointer to be after that character, thus
2138  deleting the word.  Replacing the deleted characters after it with spaces
2139  won't work since we need to preserve the lack of space to handle those sneaky
2140  HTML artificial word breakers.  So that Thelongword<blah>ing becomes
2141  "T<blah>ing" rather than "T <blah>ing", so the next step joins them up into
2142  "Ting" rather than "T" and "ing".  The first code in a UTF-8 character is
2143  11xxxxxx and subsequent ones are 10xxxxxx. */
2144
2145  while (InputStringPntr < EndOfStringPntr)
2146  {
2147    Letter = (unsigned char) *InputStringPntr;
2148    if (Letter < 128) // Got a regular ASCII letter?
2149    {
2150      if (StartOfInputLongUnicodeWord != NULL)
2151      {
2152        if (InputStringPntr - StartOfInputLongUnicodeWord >
2153        (int) g_MaxWordLength * 2)
2154        {
2155          /* Need to truncate the long word (100 bytes or about 50 characters)
2156          back down to the first UTF-8 character, so find out where the first
2157          character ends (skip past the 10xxxxxx bytes), and rewind the output
2158          pointer to be just after that (ignoring the rest of the long word in
2159          effect). */
2160
2161          OutputStringPntr = StartOfOutputLongUnicodeWord + 1;
2162          while (OutputStringPntr < InputStringPntr)
2163          {
2164            Letter = (unsigned char) *OutputStringPntr;
2165            if (Letter < 128 || Letter >= 192)
2166              break;
2167            ++OutputStringPntr; // Still a UTF-8 middle of the character code.
2168          }
2169        }
2170        StartOfInputLongUnicodeWord = NULL;
2171      }
2172    }
2173    else if (Letter >= 192 && StartOfInputLongUnicodeWord == NULL)
2174    {
2175      /* Got the start of a UTF-8 character.  Remember the spot so we can see
2176      if this is a too long UTF-8 word, which is often a whole sentence in
2177      asian languages, since they sort of use a single character per word. */
2178
2179      StartOfInputLongUnicodeWord = InputStringPntr;
2180      StartOfOutputLongUnicodeWord = OutputStringPntr;
2181    }
2182    *OutputStringPntr++ = *InputStringPntr++;
2183  }
2184  return OutputStringPntr - BufferPntr;
2185}
2186
2187
2188/* Find all the words in the string and add them to our local set of words.
2189The characters considered white space are defined by g_SpaceCharacters.  This
2190function is also used as a subroutine by other tokenizer functions when they
2191have a bunch of presumably plain text they want broken into words and added. */
2192
2193static size_t TokenizerPassGetPlainWords (
2194  char *BufferPntr,
2195  size_t NumberOfBytes,
2196  char PrefixCharacter,
2197  set<string> &WordSet)
2198{
2199  string  AccumulatedWord;
2200  char   *EndOfStringPntr;
2201  size_t  Length;
2202  int     Letter;
2203
2204  if (NumberOfBytes <= 0)
2205    return 0; /* Nothing to process. */
2206
2207  if (PrefixCharacter != 0)
2208    AccumulatedWord = PrefixCharacter;
2209  EndOfStringPntr = BufferPntr + NumberOfBytes;
2210  while (true)
2211  {
2212    if (BufferPntr >= EndOfStringPntr)
2213      Letter = EOF; // Usually a negative number.
2214    else
2215      Letter = (unsigned char) *BufferPntr++;
2216
2217    /* See if it is a letter we treat as white space.  Some word separators
2218    like dashes and periods aren't considered as space.  Note that codes above
2219    127 are UTF-8 characters, which we consider non-space. */
2220
2221    if (Letter < 0 /* EOF is -1 */ ||
2222    (Letter < 128 && g_SpaceCharacters[Letter]))
2223    {
2224      /* That space finished off a word.  Remove trailing periods... */
2225
2226      while ((Length = AccumulatedWord.size()) > 0 &&
2227      AccumulatedWord [Length-1] == '.')
2228        AccumulatedWord.resize (Length - 1);
2229
2230      /* If there's anything left in the word, add it to the set.  Also ignore
2231      words which are too big (it's probably some binary encoded data).  But
2232      leave room for supercalifragilisticexpialidoceous.  According to one web
2233      site, pneumonoultramicroscopicsilicovolcanoconiosis is the longest word
2234      currently in English.  Note that some uuencoded data was seen with a 60
2235      character line length. */
2236
2237      if (PrefixCharacter != 0)
2238        Length--; // Don't count prefix when judging size or emptiness.
2239      if (Length > 0 && Length <= g_MaxWordLength)
2240        WordSet.insert (AccumulatedWord);
2241
2242      /* Empty out the string to get ready for the next word.  Not quite empty,
2243      start it off with the prefix character if any. */
2244
2245      if (PrefixCharacter != 0)
2246        AccumulatedWord = PrefixCharacter;
2247      else
2248        AccumulatedWord.resize (0);
2249    }
2250    else /* Not a space-like character, add it to the word. */
2251      AccumulatedWord.append (1 /* one copy of the char */, (char) Letter);
2252
2253    if (Letter < 0)
2254      break; /* End of data.  Exit here so that last word got processed. */
2255  }
2256  return NumberOfBytes;
2257}
2258
2259
2260/* Delete Things from the text.  The Thing is marked by a start string and an
2261end string, such as "<!--" and "--> for HTML comment things.  All the text
2262between the markers will be added to the word list before it gets deleted from
2263the buffer.  The markers must be prepared in lower case and the buffer is
2264assumed to have already been converted to lower case.  You can specify an empty
2265string for the end marker if you're just matching a string constant like
2266"&nbsp;", which you would put in the starting marker.  This is a utility
2267function used by other tokenizer functions. */
2268
2269static size_t TokenizerUtilRemoveStartEndThing (
2270  char *BufferPntr,
2271  size_t NumberOfBytes,
2272  char PrefixCharacter,
2273  set<string> &WordSet,
2274  const char *ThingStartCode,
2275  const char *ThingEndCode,
2276  bool ReplaceWithSpace)
2277{
2278  char *EndOfStringPntr;
2279  bool  FoundAndDeletedThing;
2280  char *InputStringPntr;
2281  char *OutputStringPntr;
2282  int   ThingEndLength;
2283  char *ThingEndPntr;
2284  int   ThingStartLength;
2285
2286  InputStringPntr = BufferPntr;
2287  EndOfStringPntr = InputStringPntr + NumberOfBytes;
2288  OutputStringPntr = InputStringPntr;
2289  ThingStartLength = strlen (ThingStartCode);
2290  ThingEndLength = strlen (ThingEndCode);
2291
2292  if (ThingStartLength <= 0)
2293    return NumberOfBytes; /* Need some things to look for first! */
2294
2295  while (InputStringPntr < EndOfStringPntr)
2296  {
2297    /* Search for the starting marker. */
2298
2299    FoundAndDeletedThing = false;
2300    if (EndOfStringPntr - InputStringPntr >=
2301    ThingStartLength + ThingEndLength /* space remains for start + end */ &&
2302    *InputStringPntr == *ThingStartCode &&
2303    memcmp (InputStringPntr, ThingStartCode, ThingStartLength) == 0)
2304    {
2305      /* Found the start marker.  Look for the terminating string.  If it is an
2306      empty string, then we've found it right now! */
2307
2308      ThingEndPntr = InputStringPntr + ThingStartLength;
2309      while (EndOfStringPntr - ThingEndPntr >= ThingEndLength)
2310      {
2311        if (ThingEndLength == 0 ||
2312        (*ThingEndPntr == *ThingEndCode &&
2313        memcmp (ThingEndPntr, ThingEndCode, ThingEndLength) == 0))
2314        {
2315          /* Got the end of the Thing.  First dump the text inbetween the start
2316          and end markers into the words list. */
2317
2318          TokenizerPassGetPlainWords (InputStringPntr + ThingStartLength,
2319            ThingEndPntr - (InputStringPntr + ThingStartLength),
2320            PrefixCharacter, WordSet);
2321
2322          /* Delete by not updating the output pointer while moving the input
2323          pointer to just after the ending tag. */
2324
2325          InputStringPntr = ThingEndPntr + ThingEndLength;
2326          if (ReplaceWithSpace)
2327            *OutputStringPntr++ = ' ';
2328          FoundAndDeletedThing = true;
2329          break;
2330        }
2331        ThingEndPntr++;
2332      } /* End while ThingEndPntr */
2333    }
2334    if (!FoundAndDeletedThing)
2335      *OutputStringPntr++ = *InputStringPntr++;
2336  } /* End while InputStringPntr */
2337
2338  return OutputStringPntr - BufferPntr;
2339}
2340
2341
2342static size_t TokenizerPassRemoveHTMLComments (
2343  char *BufferPntr,
2344  size_t NumberOfBytes,
2345  char PrefixCharacter,
2346  set<string> &WordSet)
2347{
2348  return TokenizerUtilRemoveStartEndThing (BufferPntr, NumberOfBytes,
2349    PrefixCharacter, WordSet, "<!--", "-->", false);
2350}
2351
2352
2353static size_t TokenizerPassRemoveHTMLStyle (
2354  char *BufferPntr,
2355  size_t NumberOfBytes,
2356  char PrefixCharacter,
2357  set<string> &WordSet)
2358{
2359  return TokenizerUtilRemoveStartEndThing (BufferPntr, NumberOfBytes,
2360    PrefixCharacter, WordSet,
2361    "<style", "/style>", false /* replace with space if true */);
2362}
2363
2364
2365/* Convert Japanese periods (a round hollow dot symbol) to spaces so that the
2366start of the next sentence is recognised at least as the start of a very long
2367word.  The Japanese comma also does the same job. */
2368
2369static size_t TokenizerPassJapanesePeriodsToSpaces (
2370  char *BufferPntr,
2371  size_t NumberOfBytes,
2372  char PrefixCharacter,
2373  set<string> &WordSet)
2374{
2375  size_t BytesRemaining = NumberOfBytes;
2376
2377  BytesRemaining = TokenizerUtilRemoveStartEndThing (BufferPntr,
2378    BytesRemaining, PrefixCharacter, WordSet, "���" /* period */, "", true);
2379  BytesRemaining = TokenizerUtilRemoveStartEndThing (BufferPntr,
2380    BytesRemaining, PrefixCharacter, WordSet, "���" /* comma */, "", true);
2381  return BytesRemaining;
2382}
2383
2384
2385/* Delete HTML tags from the text.  The contents of the tag are added as words
2386before being deleted.  <P>, <BR> and &nbsp; are replaced by spaces at this
2387stage while other HTML things get replaced by nothing. */
2388
2389static size_t TokenizerPassRemoveHTMLTags (
2390  char *BufferPntr,
2391  size_t NumberOfBytes,
2392  char PrefixCharacter,
2393  set<string> &WordSet)
2394{
2395  size_t BytesRemaining = NumberOfBytes;
2396
2397  BytesRemaining = TokenizerUtilRemoveStartEndThing (BufferPntr,
2398    BytesRemaining, PrefixCharacter, WordSet, "&nbsp;", "", true);
2399  BytesRemaining = TokenizerUtilRemoveStartEndThing (BufferPntr,
2400    BytesRemaining, PrefixCharacter, WordSet, "<p", ">", true);
2401  BytesRemaining = TokenizerUtilRemoveStartEndThing (BufferPntr,
2402    BytesRemaining, PrefixCharacter, WordSet, "<br", ">", true);
2403  BytesRemaining = TokenizerUtilRemoveStartEndThing (BufferPntr,
2404    BytesRemaining, PrefixCharacter, WordSet, "<", ">", false);
2405  return BytesRemaining;
2406}
2407
2408
2409
2410/******************************************************************************
2411 * Implementation of the ABSApp class, constructor, destructor and the rest of
2412 * the member functions in mostly alphabetical order.
2413 */
2414
2415ABSApp::ABSApp ()
2416: BApplication (g_ABSAppSignature),
2417  m_DatabaseHasChanged (false),
2418  m_SettingsHaveChanged (false)
2419{
2420  status_t    ErrorCode;
2421  int         HalvingCount;
2422  int         i;
2423  const void *ResourceData;
2424  size_t      ResourceSize;
2425  BResources *ResourcesPntr;
2426
2427  MakeDatabaseEmpty ();
2428
2429  /* Set up the pathname which identifies our settings directory.  Note that
2430  the actual settings are loaded later on (or set to defaults) by the main()
2431  function, before this BApplication starts running.  So we don't bother
2432  initialising the other setting related variables here. */
2433
2434  ErrorCode =
2435    find_directory (B_USER_SETTINGS_DIRECTORY, &m_SettingsDirectoryPath);
2436  if (ErrorCode == B_OK)
2437    ErrorCode = m_SettingsDirectoryPath.Append (g_SettingsDirectoryName);
2438  if (ErrorCode != B_OK)
2439    m_SettingsDirectoryPath.SetTo (".");
2440
2441  /* Set up the table which identifies which characters are spaces and which
2442  are not.  Spaces are all control characters and all punctuation except for:
2443  apostrophe (so "it's" and possessive versions of words get stored), dash (for
2444  hyphenated words), dollar sign (for cash amounts), period (for IP addresses,
2445  we later remove trailing periods). */
2446
2447  memset (g_SpaceCharacters, 1, sizeof (g_SpaceCharacters));
2448  g_SpaceCharacters['\''] = false;
2449  g_SpaceCharacters['-'] = false;
2450  g_SpaceCharacters['$'] = false;
2451  g_SpaceCharacters['.'] = false;
2452  for (i = '0'; i <= '9'; i++)
2453    g_SpaceCharacters[i] = false;
2454  for (i = 'A'; i <= 'Z'; i++)
2455    g_SpaceCharacters[i] = false;
2456  for (i = 'a'; i <= 'z'; i++)
2457    g_SpaceCharacters[i] = false;
2458
2459  /* Initialise the busy cursor from data in the application's resources. */
2460
2461  if ((ResourcesPntr = AppResources ()) != NULL && (ResourceData =
2462  ResourcesPntr->LoadResource ('CURS', "Busy Cursor", &ResourceSize)) != NULL
2463  && ResourceSize >= 68 /* Size of a raw 2x16x16x8+4 cursor is 68 bytes */)
2464    g_BusyCursor = new BCursor (ResourceData);
2465
2466  /* Find out the smallest usable double by seeing how small we can make it. */
2467
2468  m_SmallestUseableDouble = 1.0;
2469  HalvingCount = 0;
2470  while (HalvingCount < 10000 && m_SmallestUseableDouble > 0.0)
2471  {
2472    HalvingCount++;
2473    m_SmallestUseableDouble /= 2;
2474  }
2475
2476  /* Recreate the number.  But don't make quite as small, we want to allow some
2477  precision bits and a bit of extra margin for intermediate results in future
2478  calculations. */
2479
2480  HalvingCount -= 50 + sizeof (double) * 8;
2481
2482  m_SmallestUseableDouble = 1.0;
2483  while (HalvingCount > 0)
2484  {
2485    HalvingCount--;
2486    m_SmallestUseableDouble /= 2;
2487  }
2488}
2489
2490
2491ABSApp::~ABSApp ()
2492{
2493  status_t ErrorCode;
2494  char     ErrorMessage [PATH_MAX + 1024];
2495
2496  if (m_SettingsHaveChanged)
2497    LoadSaveSettings (false /* DoLoad */);
2498  if ((ErrorCode = SaveDatabaseIfNeeded (ErrorMessage)) != B_OK)
2499    DisplayErrorMessage (ErrorMessage, ErrorCode, "Exiting Error");
2500  delete g_BusyCursor;
2501  g_BusyCursor = NULL;
2502}
2503
2504
2505/* Display a box showing information about this program. */
2506
2507void
2508ABSApp::AboutRequested ()
2509{
2510  BAlert *AboutAlertPntr;
2511
2512  AboutAlertPntr = new BAlert ("About",
2513"SpamDBM - Spam Database Manager\n\n"
2514
2515"This is a BeOS program for classifying e-mail messages as spam (unwanted \
2516junk mail) or as genuine mail using a Bayesian statistical approach.  There \
2517is also a Mail Daemon Replacement add-on to filter mail using the \
2518classification statistics collected earlier.\n\n"
2519
2520"Written by Alexander G. M. Smith, fall 2002.\n\n"
2521
2522"The original idea was from Paul Graham's algorithm, which has an excellent \
2523writeup at: http://www.paulgraham.com/spam.html\n\n"
2524
2525"Gary Robinson came up with the improved algorithm, which you can read about \
2526at: http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html\n\n"
2527
2528"Mr. Robinson, Tim Peters and the SpamBayes mailing list people then \
2529developed the even better chi-squared scoring method.\n\n"
2530
2531"Icon courtesy of Isaac Yonemoto, though it is no longer used since Hormel \
2532doesn't want their meat product associated with junk e-mail.\n\n"
2533
2534"Tokenising code updated in 2005 to use some of the tricks that SpamBayes \
2535uses to extract words from messages.  In particular, HTML is now handled.\n\n"
2536
2537"Released to the public domain, with no warranty.\n"
2538"$Revision: 30630 $\n"
2539"Compiled on " __DATE__ " at " __TIME__ ".", "Done");
2540  if (AboutAlertPntr != NULL)
2541  {
2542    AboutAlertPntr->SetFlags(AboutAlertPntr->Flags() | B_CLOSE_ON_ESCAPE);
2543    AboutAlertPntr->Go ();
2544  }
2545}
2546
2547
2548/* Add the text in the given file to the database as an example of a spam or
2549genuine message, or removes it from the database if you claim it is
2550CL_UNCERTAIN.  Also resets the spam ratio attribute to show the effect of the
2551database change. */
2552
2553status_t ABSApp::AddFileToDatabase (
2554  ClassificationTypes IsSpamOrWhat,
2555  const char *FileName,
2556  char *ErrorMessage)
2557{
2558  status_t ErrorCode;
2559  BFile    MessageFile;
2560  BMessage TempBMessage;
2561
2562  ErrorCode = MessageFile.SetTo (FileName, B_READ_ONLY);
2563  if (ErrorCode != B_OK)
2564  {
2565    sprintf (ErrorMessage, "Unable to open file \"%s\" for reading", FileName);
2566    return ErrorCode;
2567  }
2568
2569  ErrorCode = AddPositionIOToDatabase (IsSpamOrWhat,
2570    &MessageFile, FileName, ErrorMessage);
2571  MessageFile.Unset ();
2572  if (ErrorCode != B_OK)
2573    return ErrorCode;
2574
2575  /* Re-evaluate the file so that the user sees the new ratio attribute. */
2576  return EvaluateFile (FileName, &TempBMessage, ErrorMessage);
2577}
2578
2579
2580/* Add the given text to the database.  The unique words found in MessageIOPntr
2581will be added to the database (incrementing the count for the number of
2582messages using each word, either the spam or genuine count depending on
2583IsSpamOrWhat).  It will remove the message (decrement the word counts) if you
2584specify CL_UNCERTAIN as the new classification.  And if it switches from spam
2585to genuine or vice versa, it will do both - decrement the counts for the old
2586class and increment the counts for the new one.  An attribute will be added to
2587MessageIOPntr (if it is a file) to record that it has been marked as Spam or
2588Genuine (so that it doesn't get added to the database a second time).  If it is
2589being removed from the database, the classification attribute gets removed too.
2590If things go wrong, a non-zero error code will be returned and an explanation
2591written to ErrorMessage (assumed to be at least PATH_MAX + 1024 bytes long).
2592OptionalFileName is just used in the error message to identify the file to the
2593user. */
2594
2595status_t ABSApp::AddPositionIOToDatabase (
2596  ClassificationTypes IsSpamOrWhat,
2597  BPositionIO *MessageIOPntr,
2598  const char *OptionalFileName,
2599  char *ErrorMessage)
2600{
2601  BNode                             *BNodePntr;
2602  char                               ClassificationString [NAME_MAX];
2603  StatisticsMap::iterator            DataIter;
2604  status_t                           ErrorCode = 0;
2605  pair<StatisticsMap::iterator,bool> InsertResult;
2606  uint32                             NewAge;
2607  StatisticsRecord                   NewStatistics;
2608  ClassificationTypes                PreviousClassification;
2609  StatisticsPointer                  StatisticsPntr;
2610  set<string>::iterator              WordEndIter;
2611  set<string>::iterator              WordIter;
2612  set<string>                        WordSet;
2613
2614  NewAge = m_TotalGenuineMessages + m_TotalSpamMessages;
2615  if (NewAge >= 0xFFFFFFF0UL)
2616  {
2617    sprintf (ErrorMessage,
2618      "The database is full!  There are %" B_PRIu32 " messages in "
2619      "it and we can't add any more without overflowing the maximum integer "
2620      "representation in 32 bits", NewAge);
2621    return B_NO_MEMORY;
2622  }
2623
2624  /* Check that this file hasn't already been added to the database. */
2625
2626  PreviousClassification = CL_UNCERTAIN;
2627  BNodePntr = dynamic_cast<BNode *> (MessageIOPntr);
2628  if (BNodePntr != NULL) /* If this thing might have attributes. */
2629  {
2630    ErrorCode = BNodePntr->ReadAttr (g_AttributeNameClassification,
2631      B_STRING_TYPE, 0 /* offset */, ClassificationString,
2632      sizeof (ClassificationString) - 1);
2633    if (ErrorCode <= 0) /* Positive values for the number of bytes read */
2634      strcpy (ClassificationString, "none");
2635    else /* Just in case it needs a NUL at the end. */
2636      ClassificationString [ErrorCode] = 0;
2637
2638    if (strcasecmp (ClassificationString, g_ClassifiedSpam) == 0)
2639      PreviousClassification = CL_SPAM;
2640    else if (strcasecmp (ClassificationString, g_ClassifiedGenuine) == 0)
2641      PreviousClassification = CL_GENUINE;
2642  }
2643
2644  if (!m_IgnorePreviousClassification &&
2645  PreviousClassification != CL_UNCERTAIN)
2646  {
2647    if (IsSpamOrWhat == PreviousClassification)
2648    {
2649      sprintf (ErrorMessage, "Ignoring file \"%s\" since it seems to have "
2650        "already been classified as %s.", OptionalFileName,
2651        g_ClassificationTypeNames [IsSpamOrWhat]);
2652    }
2653    else
2654    {
2655      sprintf (ErrorMessage, "Changing existing classification of file \"%s\" "
2656        "from %s to %s.", OptionalFileName,
2657        g_ClassificationTypeNames [PreviousClassification],
2658        g_ClassificationTypeNames [IsSpamOrWhat]);
2659    }
2660    DisplayErrorMessage (ErrorMessage, 0, "Note");
2661  }
2662
2663  if (!m_IgnorePreviousClassification &&
2664  IsSpamOrWhat == PreviousClassification)
2665    /* Nothing to do if it is already classified correctly and the user doesn't
2666    want double classification. */
2667    return B_OK;
2668
2669  /* Get the list of unique words in the file. */
2670
2671  ErrorCode = GetWordsFromPositionIO (MessageIOPntr, OptionalFileName,
2672    WordSet, ErrorMessage);
2673  if (ErrorCode != B_OK)
2674    return ErrorCode;
2675
2676  /* Update the count of the number of messages processed, with corrections if
2677  reclassifying a message. */
2678
2679  m_DatabaseHasChanged = true;
2680
2681  if (!m_IgnorePreviousClassification &&
2682  PreviousClassification == CL_SPAM && m_TotalSpamMessages > 0)
2683    m_TotalSpamMessages--;
2684
2685  if (IsSpamOrWhat == CL_SPAM)
2686    m_TotalSpamMessages++;
2687
2688  if (!m_IgnorePreviousClassification &&
2689  PreviousClassification == CL_GENUINE && m_TotalGenuineMessages > 0)
2690      m_TotalGenuineMessages--;
2691
2692  if (IsSpamOrWhat == CL_GENUINE)
2693    m_TotalGenuineMessages++;
2694
2695  /* Mark the file's attributes with the new classification.  Don't care if it
2696  fails. */
2697
2698  if (BNodePntr != NULL) /* If this thing might have attributes. */
2699  {
2700    ErrorCode = BNodePntr->RemoveAttr (g_AttributeNameClassification);
2701    if (IsSpamOrWhat != CL_UNCERTAIN)
2702    {
2703      strcpy (ClassificationString, g_ClassificationTypeNames [IsSpamOrWhat]);
2704      ErrorCode = BNodePntr->WriteAttr (g_AttributeNameClassification,
2705        B_STRING_TYPE, 0 /* offset */,
2706        ClassificationString, strlen (ClassificationString) + 1);
2707    }
2708  }
2709
2710  /* Add the words to the database by incrementing or decrementing the counts
2711  for each word as appropriate. */
2712
2713  WordEndIter = WordSet.end ();
2714  for (WordIter = WordSet.begin (); WordIter != WordEndIter; WordIter++)
2715  {
2716    if ((DataIter = m_WordMap.find (*WordIter)) == m_WordMap.end ())
2717    {
2718      /* No record in the database for the word. */
2719
2720      if (IsSpamOrWhat == CL_UNCERTAIN)
2721        continue; /* Not adding words, don't have to subtract from nothing. */
2722
2723      /* Create a new one record in the database for the new word. */
2724
2725      memset (&NewStatistics, 0, sizeof (NewStatistics));
2726      InsertResult = m_WordMap.insert (
2727        StatisticsMap::value_type (*WordIter, NewStatistics));
2728      if (!InsertResult.second)
2729      {
2730        sprintf (ErrorMessage, "Failed to insert new database entry for "
2731          "word \"%s\", while processing file \"%s\"",
2732          WordIter->c_str (), OptionalFileName);
2733        return B_NO_MEMORY;
2734      }
2735      DataIter = InsertResult.first;
2736      m_WordCount++;
2737    }
2738
2739    /* Got the database record for the word, update the statistics. */
2740
2741    StatisticsPntr = &DataIter->second;
2742
2743    StatisticsPntr->age = NewAge;
2744
2745    /* Can't update m_OldestAge here, since it would take a lot of effort to
2746    find the next older age.  Since it's only used for display, we'll let it be
2747    slightly incorrect.  The next database load or purge will fix it. */
2748
2749    if (IsSpamOrWhat == CL_SPAM)
2750      StatisticsPntr->spamCount++;
2751
2752    if (IsSpamOrWhat == CL_GENUINE)
2753      StatisticsPntr->genuineCount++;
2754
2755    if (!m_IgnorePreviousClassification &&
2756    PreviousClassification == CL_SPAM && StatisticsPntr->spamCount > 0)
2757      StatisticsPntr->spamCount--;
2758
2759    if (!m_IgnorePreviousClassification &&
2760    PreviousClassification == CL_GENUINE && StatisticsPntr->genuineCount > 0)
2761      StatisticsPntr->genuineCount--;
2762  }
2763
2764  return B_OK;
2765}
2766
2767
2768/* Add the text in the string to the database as an example of a spam or
2769genuine message. */
2770
2771status_t ABSApp::AddStringToDatabase (
2772  ClassificationTypes IsSpamOrWhat,
2773  const char *String,
2774  char *ErrorMessage)
2775{
2776  BMemoryIO MemoryIO (String, strlen (String));
2777
2778  return AddPositionIOToDatabase (IsSpamOrWhat, &MemoryIO,
2779   "Memory Buffer" /* OptionalFileName */, ErrorMessage);
2780}
2781
2782
2783/* Given a bunch of text, find the words within it (doing special tricks to
2784extract words from HTML), and add them to the set.  Allow NULs in the text.  If
2785the PrefixCharacter isn't zero then it is prepended to all words found (so you
2786can distinguish words as being from a header or from the body text).  See also
2787TokenizeWhole which does something similar. */
2788
2789void
2790ABSApp::AddWordsToSet (
2791  const char *InputString,
2792  size_t NumberOfBytes,
2793  char PrefixCharacter,
2794  set<string> &WordSet)
2795{
2796  char   *BufferPntr;
2797  size_t  CurrentSize;
2798  int     PassNumber;
2799
2800  /* Copy the input buffer.  The code will be modifying it in-place as HTML
2801  fragments and other junk are deleted. */
2802
2803  BufferPntr = new char [NumberOfBytes];
2804  if (BufferPntr == NULL)
2805    return;
2806  memcpy (BufferPntr, InputString, NumberOfBytes);
2807
2808  /* Do the tokenization.  Each pass does something to the text in the buffer,
2809  and may add words to the word set. */
2810
2811  CurrentSize = NumberOfBytes;
2812  for (PassNumber = 1; PassNumber <= 8 && CurrentSize > 0 ; PassNumber++)
2813  {
2814    switch (PassNumber)
2815    {
2816      case 1: /* Lowercase first, rest of them assume lower case inputs. */
2817        CurrentSize = TokenizerPassLowerCase (BufferPntr, CurrentSize);
2818        break;
2819      case 2: CurrentSize = TokenizerPassJapanesePeriodsToSpaces (
2820        BufferPntr, CurrentSize, PrefixCharacter, WordSet); break;
2821      case 3: CurrentSize = TokenizerPassTruncateLongAsianWords (
2822        BufferPntr, CurrentSize); break;
2823      case 4: CurrentSize = TokenizerPassRemoveHTMLComments (
2824        BufferPntr, CurrentSize, 'Z', WordSet); break;
2825      case 5: CurrentSize = TokenizerPassRemoveHTMLStyle (
2826        BufferPntr, CurrentSize, 'Z', WordSet); break;
2827      case 6: CurrentSize = TokenizerPassExtractURLs (
2828        BufferPntr, CurrentSize, 'Z', WordSet); break;
2829      case 7: CurrentSize = TokenizerPassRemoveHTMLTags (
2830        BufferPntr, CurrentSize, 'Z', WordSet); break;
2831      case 8: CurrentSize = TokenizerPassGetPlainWords (
2832        BufferPntr, CurrentSize, PrefixCharacter, WordSet); break;
2833      default: break;
2834    }
2835  }
2836
2837  delete [] BufferPntr;
2838}
2839
2840
2841/* The user has provided a command line.  This could actually be from a
2842separate attempt to invoke the program (this application's resource/attributes
2843have the launch flags set to "single launch", so the shell doesn't start the
2844program but instead sends the arguments to the already running instance).  In
2845either case, the command is sent to an intermediary thread where it is
2846asynchronously converted into a scripting message(s) that are sent back to this
2847BApplication.  The intermediary is needed since we can't recursively execute
2848scripting messages while processing a message (this ArgsReceived one). */
2849
2850void
2851ABSApp::ArgvReceived (int32 argc, char **argv)
2852{
2853  if (g_CommanderLooperPntr != NULL)
2854    g_CommanderLooperPntr->CommandArguments (argc, argv);
2855}
2856
2857
2858/* Create a new empty database.  Note that we have to write out the new file
2859immediately, otherwise other operations will see the empty database and then
2860try to load the file, and complain that it doesn't exist.  Now they will see
2861the empty database and redundantly load the empty file. */
2862
2863status_t ABSApp::CreateDatabaseFile (char *ErrorMessage)
2864{
2865  MakeDatabaseEmpty ();
2866  m_DatabaseHasChanged = true;
2867  return SaveDatabaseIfNeeded (ErrorMessage); /* Make it now. */
2868}
2869
2870
2871/* Set the settings to the defaults.  Needed in case there isn't a settings
2872file or it is obsolete. */
2873
2874void
2875ABSApp::DefaultSettings ()
2876{
2877  status_t ErrorCode;
2878  BPath    DatabasePath (m_SettingsDirectoryPath);
2879  char     TempString [PATH_MAX];
2880
2881  /* The default database file is in the settings directory. */
2882
2883  ErrorCode = DatabasePath.Append (g_DefaultDatabaseFileName);
2884  if (ErrorCode != B_OK)
2885    strcpy (TempString, g_DefaultDatabaseFileName); /* Unlikely to happen. */
2886  else
2887    strcpy (TempString, DatabasePath.Path ());
2888  m_DatabaseFileName.SetTo (TempString);
2889
2890  // Users need to be allowed to undo their mistakes...
2891  m_IgnorePreviousClassification = true;
2892  g_ServerMode = true;
2893  m_PurgeAge = 2000;
2894  m_PurgePopularity = 2;
2895  m_ScoringMode = SM_CHISQUARED;
2896  m_TokenizeMode = TM_ANY_TEXT_HEADER;
2897
2898  m_SettingsHaveChanged = true;
2899}
2900
2901
2902/* Deletes the database file, and the backup file, and clears the database but
2903marks it as not changed so that it doesn't get written out when the program
2904exits. */
2905
2906status_t ABSApp::DeleteDatabaseFile (char *ErrorMessage)
2907{
2908  BEntry   FileEntry;
2909  status_t ErrorCode;
2910  int      i;
2911  char     TempString [PATH_MAX+20];
2912
2913  /* Clear the in-memory database. */
2914
2915  MakeDatabaseEmpty ();
2916  m_DatabaseHasChanged = false;
2917
2918  /* Delete the backup files first.  Don't care if it fails. */
2919
2920  for (i = 0; i < g_MaxBackups; i++)
2921  {
2922    strcpy (TempString, m_DatabaseFileName.String ());
2923    sprintf (TempString + strlen (TempString), g_BackupSuffix, i);
2924    ErrorCode = FileEntry.SetTo (TempString);
2925    if (ErrorCode == B_OK)
2926      FileEntry.Remove ();
2927  }
2928
2929  /* Delete the main database file. */
2930
2931  strcpy (TempString, m_DatabaseFileName.String ());
2932  ErrorCode = FileEntry.SetTo (TempString);
2933  if (ErrorCode != B_OK)
2934  {
2935    sprintf (ErrorMessage, "While deleting, failed to make BEntry for "
2936      "\"%s\" (does the directory exist?)", TempString);
2937    return ErrorCode;
2938  }
2939
2940  ErrorCode = FileEntry.Remove ();
2941  if (ErrorCode != B_OK)
2942    sprintf (ErrorMessage, "While deleting, failed to remove file "
2943      "\"%s\"", TempString);
2944
2945  return ErrorCode;
2946}
2947
2948
2949/* Evaluate the given file as being a spam message, and tag it with the
2950resulting spam probability ratio.  If it also has an e-mail subject attribute,
2951remove the [Spam 99.9%] prefix since the number usually changes. */
2952
2953status_t ABSApp::EvaluateFile (
2954  const char *PathName,
2955  BMessage *ReplyMessagePntr,
2956  char *ErrorMessage)
2957{
2958  status_t ErrorCode;
2959  float    TempFloat;
2960  BFile    TextFile;
2961
2962  /* Open the specified file. */
2963
2964  ErrorCode = TextFile.SetTo (PathName, B_READ_ONLY);
2965  if (ErrorCode != B_OK)
2966  {
2967    sprintf (ErrorMessage, "Problems opening file \"%s\" for evaluating",
2968      PathName);
2969    return ErrorCode;
2970  }
2971
2972  ErrorCode =
2973    EvaluatePositionIO (&TextFile, PathName, ReplyMessagePntr, ErrorMessage);
2974
2975  if (ErrorCode == B_OK &&
2976  ReplyMessagePntr->FindFloat (g_ResultName, &TempFloat) == B_OK)
2977  {
2978    TextFile.WriteAttr (g_AttributeNameSpamRatio, B_FLOAT_TYPE,
2979      0 /* offset */, &TempFloat, sizeof (TempFloat));
2980    /* Don't know the spam cutoff ratio, that's in the e-mail filter, so just
2981    blindly remove the prefix, which would have the wrong percentage. */
2982    RemoveSpamPrefixFromSubjectAttribute (&TextFile);
2983  }
2984
2985  return ErrorCode;
2986}
2987
2988
2989/* Evaluate a given file or memory buffer (a BPositionIO handles both cases)
2990for spaminess.  The output is added to the ReplyMessagePntr message, with the
2991probability ratio stored in "result" (0.0 means genuine and 1.0 means spam).
2992It also adds the most significant words (used in the ratio calculation) to the
2993array "words" and the associated per-word probability ratios in "ratios".  If
2994it fails, an error code is returned and an error message written to the
ErrorMessage string (which is at least PATH_MAX + 1024 bytes long).
2996OptionalFileName is only used in the error message.
2997
2998The math used for combining the individual word probabilities in my method is
2999based on Gary Robinson's method (formerly it was a variation of Paul Graham's
method) or the Chi-Squared method.  Its input is the database of words that
3001has a count of the number of spam and number of genuine messages each word
3002appears in (doesn't matter if it appears more than once in a message, it still
3003counts as 1).
3004
The spam word count is divided by the total number of spam e-mail messages
3006in the database to get the probability of spam and probability of genuineness
3007is similarly computed for a particular word.  The spam probability is divided
3008by the sum of the spam and genuine probabilities to get the Raw Spam Ratio for
3009the word.  It's nearer to 0.0 for genuine and nearer to 1.0 for spam, and can
3010be exactly zero or one too.
3011
3012To avoid multiplying later results by zero, and to compensate for a lack of
3013data points, the Raw Spam Ratio is adjusted towards the 0.5 halfway point.  The
30140.5 is combined with the raw spam ratio, with a weight of 0.45 (determined to
3015be a good value by the "spambayes" mailing list tests) messages applied to the
3016half way point and a weight of the number of spam + genuine messages applied to
3017the raw spam ratio.  This gives you the compensated spam ratio for the word.
3018
3019The top N (150 was good in the spambayes tests) extreme words are selected by
3020the distance of each word's compensated spam ratio from 0.5.  Then the ratios
3021of the words are combined.
3022
3023The Gary Robinson combining (scoring) method gets one value from the Nth root
3024of the product of all the word ratios.  The other is the Nth root of the
3025product of (1 - ratio) for all the words.  The final result is the first value
3026divided by the sum of the two values.  The Nth root helps spread the resulting
3027range of values more evenly between 0.0 and 1.0, otherwise the values all clump
3028together at 0 or 1.  Also you can think of the Nth root as a kind of average
3029for products; it's like a generic word probability which when multiplied by
3030itself N times gives you the same result as the N separate actual word
3031probabilities multiplied together.
3032
3033The Chi-Squared combining (scoring) method assumes that the spam word
3034probabilities are uniformly distributed and computes an error measurement
3035(called chi squared - see http://bmj.com/collections/statsbk/8.shtml for a good
3036tutorial) and then sees how likely that error value would be observed in
3037practice.  If it's rare to observe, then the words are likely not just randomly
occurring and it's spammy.  The same is done for genuine words.  The two
3039resulting unlikelynesses are compared to see which is more unlikely, if neither
3040is, then the method says it can't decide.  The SpamBayes notes (see the
3041classifier.py file in CVS in http://sourceforge.net/projects/spambayes) say:
3042
3043"Across vectors of length n, containing random uniformly-distributed
3044probabilities, -2*sum(ln(p_i)) follows the chi-squared distribution with 2*n
3045degrees of freedom.  This has been proven (in some appropriate sense) to be the
3046most sensitive possible test for rejecting the hypothesis that a vector of
3047probabilities is uniformly distributed.  Gary Robinson's original scheme was
3048monotonic *with* this test, but skipped the details.  Turns out that getting
3049closer to the theoretical roots gives a much sharper classification, with a
3050very small (in # of msgs), but also very broad (in range of scores), "middle
3051ground", where most of the mistakes live.  In particular, this scheme seems
3052immune to all forms of "cancellation disease": if there are many strong ham
3053*and* spam clues, this reliably scores close to 0.5.  Most other schemes are
3054extremely certain then -- and often wrong."
3055
3056I did a test with 448 example genuine messages including personal mail (some
3057with HTML attachments) and mailing lists, and 267 spam messages for 27471 words
3058total.  Test messages were more recent messages in the same groups.  Out of 100
3059test genuine messages, with Gary Robinson (0.56 cutoff limit), 1 (1%) was
3060falsely identified as spam and 8 of 73 (11%) spam messages were incorrectly
3061classified as genuine.  With my variation of Paul Graham's scheme (0.90 cutoff)
3062I got 6 of 100 (6%) genuine messages incorrectly marked as spam and 2 of 73
3063(3%) spam messages were incorrectly classified as genuine.  Pretty close, but
3064Robinson's values are more evenly spread out so you can tell just how spammy it
3065is by looking at the number. */
3066
/* A word paired with its compensated spam ratio.  The structure also serves
as its own comparison functor: EvaluatePositionIO uses it as the ordering for
a priority_queue, which then hands out the word whose ratio is furthest from
the neutral 0.5 first. */

struct WordAndRatioStruct
{
  double        probabilityRatio; /* Actually the compensated ratio. */
  const string *wordPntr; /* The word text, stored elsewhere, not owned. */

  /* Our less-than comparison function for sorting.  ItemA counts as less
  than ItemB when its ratio is closer to 0.5 than ItemB's ratio is. */

  bool operator() (
    const WordAndRatioStruct &ItemA,
    const WordAndRatioStruct &ItemB) const
  {
    const double DistanceFromNeutralA = fabs (ItemA.probabilityRatio - 0.5);
    const double DistanceFromNeutralB = fabs (ItemB.probabilityRatio - 0.5);

    return DistanceFromNeutralA < DistanceFromNeutralB;
  }
};
3081
3082status_t ABSApp::EvaluatePositionIO (
3083  BPositionIO *PositionIOPntr,
3084  const char *OptionalFileName,
3085  BMessage *ReplyMessagePntr,
3086  char *ErrorMessage)
3087{
3088  StatisticsMap::iterator            DataEndIter;
3089  StatisticsMap::iterator            DataIter;
3090  status_t                           ErrorCode;
3091  double                             GenuineProbability;
3092  uint32                             GenuineSpamSum;
3093  int                                i;
3094  priority_queue<
3095    WordAndRatioStruct /* Data type stored in the queue */,
3096    vector<WordAndRatioStruct> /* Underlying container */,
3097    WordAndRatioStruct /* Function for comparing elements */>
3098                                     PriorityQueue;
3099  double                             ProductGenuine;
3100  double                             ProductLogGenuine;
3101  double                             ProductLogSpam;
3102  double                             ProductSpam;
3103  double                             RawProbabilityRatio;
3104  float                              ResultRatio;
3105  double                             SpamProbability;
3106  StatisticsPointer                  StatisticsPntr;
3107  double                             TempDouble;
3108  double                             TotalGenuine;
3109  double                             TotalSpam;
3110  WordAndRatioStruct                 WordAndRatio;
3111  set<string>::iterator              WordEndIter;
3112  set<string>::iterator              WordIter;
3113  const WordAndRatioStruct          *WordRatioPntr;
3114  set<string>                        WordSet;
3115
3116  /* Get the list of unique words in the file / memory buffer. */
3117
3118  ErrorCode = GetWordsFromPositionIO (PositionIOPntr, OptionalFileName,
3119    WordSet, ErrorMessage);
3120  if (ErrorCode != B_OK)
3121    return ErrorCode;
3122
3123  /* Prepare a few variables.  Mostly these are stored double values of some of
3124  the numbers involved (to avoid the overhead of multiple conversions from
3125  integer to double), with extra precautions to avoid divide by zero. */
3126
3127  if (m_TotalGenuineMessages <= 0)
3128    TotalGenuine = 1.0;
3129  else
3130    TotalGenuine = m_TotalGenuineMessages;
3131
3132  if (m_TotalSpamMessages <= 0)
3133    TotalSpam = 1.0;
3134  else
3135    TotalSpam = m_TotalSpamMessages;
3136
3137  /* Look up the words in the database and calculate their compensated spam
3138  ratio.  The results are stored in a priority queue so that we can later find
3139  the top g_MaxInterestingWords for doing the actual determination. */
3140
3141  WordEndIter = WordSet.end ();
3142  DataEndIter = m_WordMap.end ();
3143  for (WordIter = WordSet.begin (); WordIter != WordEndIter; WordIter++)
3144  {
3145    WordAndRatio.wordPntr = &(*WordIter);
3146
3147    if ((DataIter = m_WordMap.find (*WordIter)) != DataEndIter)
3148    {
3149      StatisticsPntr = &DataIter->second;
3150
3151      /* Calculate the probability the word is spam and the probability it is
3152      genuine.  Then the raw probability ratio. */
3153
3154      SpamProbability = StatisticsPntr->spamCount / TotalSpam;
3155      GenuineProbability = StatisticsPntr->genuineCount / TotalGenuine;
3156
3157      if (SpamProbability + GenuineProbability > 0)
3158        RawProbabilityRatio =
3159        SpamProbability / (SpamProbability + GenuineProbability);
3160      else /* Word with zero statistics, perhaps due to reclassification. */
3161        RawProbabilityRatio = 0.5;
3162
3163      /* The compensated ratio leans towards 0.5 (g_RobinsonX) more for fewer
3164      data points, with a weight of 0.45 (g_RobinsonS). */
3165
3166      GenuineSpamSum =
3167        StatisticsPntr->spamCount + StatisticsPntr->genuineCount;
3168
3169      WordAndRatio.probabilityRatio =
3170        (g_RobinsonS * g_RobinsonX + GenuineSpamSum * RawProbabilityRatio) /
3171        (g_RobinsonS + GenuineSpamSum);
3172    }
3173    else /* Unknown word. With N=0, compensated ratio equation is RobinsonX. */
3174      WordAndRatio.probabilityRatio = g_RobinsonX;
3175
3176     PriorityQueue.push (WordAndRatio);
3177  }
3178
3179  /* Compute the combined probability (multiply them together) of the top few
3180  words.  To avoid numeric underflow (doubles can only get as small as 1E-300),
3181  logarithms are also used.  But avoid the logarithms (sum of logs of numbers
3182  is the same as the product of numbers) as much as possible due to reduced
3183  accuracy and slowness. */
3184
3185  ProductGenuine = 1.0;
3186  ProductLogGenuine = 0.0;
3187  ProductSpam = 1.0;
3188  ProductLogSpam = 0.0;
3189  for (i = 0;
3190  i < g_MaxInterestingWords && !PriorityQueue.empty();
3191  i++, PriorityQueue.pop())
3192  {
3193    WordRatioPntr = &PriorityQueue.top();
3194    ProductSpam *= WordRatioPntr->probabilityRatio;
3195    ProductGenuine *= 1.0 - WordRatioPntr->probabilityRatio;
3196
3197    /* Check for the numbers getting dangerously small, close to underflowing.
3198    If they are, move the value into the logarithm storage part. */
3199
3200    if (ProductSpam < m_SmallestUseableDouble)
3201    {
3202      ProductLogSpam += log (ProductSpam);
3203      ProductSpam = 1.0;
3204    }
3205
3206    if (ProductGenuine < m_SmallestUseableDouble)
3207    {
3208      ProductLogGenuine += log (ProductGenuine);
3209      ProductGenuine = 1.0;
3210    }
3211
3212    ReplyMessagePntr->AddString ("words", WordRatioPntr->wordPntr->c_str ());
3213    ReplyMessagePntr->AddFloat ("ratios", WordRatioPntr->probabilityRatio);
3214  }
3215
3216  /* Get the resulting log of the complete products. */
3217
3218  if (i > 0)
3219  {
3220    ProductLogSpam += log (ProductSpam);
3221    ProductLogGenuine += log (ProductGenuine);
3222  }
3223
3224  if (m_ScoringMode == SM_ROBINSON)
3225  {
3226    /* Apply Gary Robinson's scoring method where we take the Nth root of the
3227    products.  This is easiest in logarithm form. */
3228
3229    if (i > 0)
3230    {
3231      ProductSpam = exp (ProductLogSpam / i);
3232      ProductGenuine = exp (ProductLogGenuine / i);
3233      ResultRatio = ProductSpam / (ProductGenuine + ProductSpam);
3234    }
3235    else /* Somehow got no words! */
3236      ResultRatio = g_RobinsonX;
3237  }
3238  else if (m_ScoringMode == SM_CHISQUARED)
3239  {
3240    /* From the SpamBayes notes: "We compute two chi-squared statistics, one
3241    for ham and one for spam.  The sum-of-the-logs business is more sensitive
3242    to probs near 0 than to probs near 1, so the spam measure uses 1-p (so that
3243    high-spamprob words have greatest effect), and the ham measure uses p
3244    directly (so that lo-spamprob words have greatest effect)."  That means we
3245    just reversed the meaning of the previously calculated spam and genuine
3246    products!  Oh well. */
3247
3248    TempDouble = ProductLogSpam;
3249    ProductLogSpam = ProductLogGenuine;
3250    ProductLogGenuine = TempDouble;
3251
3252    if (i > 0)
3253    {
3254      ProductSpam =
3255        1.0 - ChiSquaredProbability (-2.0 * ProductLogSpam, 2 * i);
3256      ProductGenuine =
3257        1.0 - ChiSquaredProbability (-2.0 * ProductLogGenuine, 2 * i);
3258
3259      /* The SpamBayes notes say: "How to combine these into a single spam
3260      score?  We originally used (S-H)/(S+H) scaled into [0., 1.], which equals
3261      S/(S+H).  A systematic problem is that we could end up being near-certain
3262      a thing was (for example) spam, even if S was small, provided that H was
3263      much smaller.  Rob Hooft stared at these problems and invented the
3264      measure we use now, the simpler S-H, scaled into [0., 1.]." */
3265
3266      ResultRatio = (ProductSpam - ProductGenuine + 1.0) / 2.0;
3267    }
3268    else /* No words to analyse. */
3269      ResultRatio = 0.5;
3270  }
3271  else /* Unknown scoring mode. */
3272  {
3273    strcpy (ErrorMessage, "Unknown scoring mode specified in settings");
3274    return B_BAD_VALUE;
3275  }
3276
3277  ReplyMessagePntr->AddFloat (g_ResultName, ResultRatio);
3278  return B_OK;
3279}
3280
3281
3282/* Just evaluate the given string as being spam text. */
3283
3284status_t ABSApp::EvaluateString (
3285  const char *BufferPntr,
3286  ssize_t BufferSize,
3287  BMessage *ReplyMessagePntr,
3288  char *ErrorMessage)
3289{
3290  BMemoryIO MemoryIO (BufferPntr, BufferSize);
3291
3292  return EvaluatePositionIO (&MemoryIO, "Memory Buffer",
3293    ReplyMessagePntr, ErrorMessage);
3294}
3295
3296
3297/* Tell other programs about the scripting commands we support.  Try this
3298command: "hey application/x-vnd.agmsmith.spamdbm getsuites" to
3299see it in action (this program has to be already running for it to work). */
3300
3301status_t ABSApp::GetSupportedSuites (BMessage *MessagePntr)
3302{
3303  BPropertyInfo TempPropInfo (g_ScriptingPropertyList);
3304
3305  MessagePntr->AddString ("suites", "suite/x-vnd.agmsmith.spamdbm");
3306  MessagePntr->AddFlat ("messages", &TempPropInfo);
3307  return BApplication::GetSupportedSuites (MessagePntr);
3308}
3309
3310
3311/* Add all the words in the given file or memory buffer to the supplied set.
3312The file name is only there for error messages, it assumes you have already
3313opened the PositionIO to the right file.  If things go wrong, a non-zero error
3314code will be returned and an explanation written to ErrorMessage (assumed to be
3315at least PATH_MAX + 1024 bytes long). */
3316
3317status_t ABSApp::GetWordsFromPositionIO (
3318  BPositionIO *PositionIOPntr,
3319  const char *OptionalFileName,
3320  set<string> &WordSet,
3321  char *ErrorMessage)
3322{
3323  status_t ErrorCode;
3324
3325  if (m_TokenizeMode == TM_WHOLE)
3326    ErrorCode = TokenizeWhole (PositionIOPntr, OptionalFileName,
3327      WordSet, ErrorMessage);
3328  else
3329    ErrorCode = TokenizeParts (PositionIOPntr, OptionalFileName,
3330      WordSet, ErrorMessage);
3331
3332  if (ErrorCode == B_OK && WordSet.empty ())
3333  {
3334    /* ENOMSG usually means no message found in queue, but I'm using it to show
3335    no words, a good indicator of spam which is pure HTML. */
3336
3337    sprintf (ErrorMessage, "No words were found in \"%s\"", OptionalFileName);
3338    ErrorCode = ENOMSG;
3339  }
3340
3341  return ErrorCode;
3342}
3343
3344
3345/* Set up indices for attributes MAIL:classification (string) and
3346MAIL:ratio_spam (float) on all mounted disk volumes that support queries.  Also
3347tell the system to make those attributes visible to the user (so they can see
3348them in Tracker) and associate them with e-mail messages.  Also set up the
3349database file MIME type (provide a description and associate it with this
3350program so that it picks up the right icon).  And register the names for our
3351sound effects. */
3352
3353status_t ABSApp::InstallThings (char *ErrorMessage)
3354{
3355  int32       Cookie;
3356  dev_t       DeviceID;
3357  status_t    ErrorCode = B_OK;
3358  fs_info     FSInfo;
3359  int32       i;
3360  int32       iClassification;
3361  int32       iProbability;
3362  int32       j;
3363  index_info  IndexInfo;
3364  BMimeType   MimeType;
3365  BMessage    Parameters;
3366  const char *StringPntr;
3367  bool        TempBool;
3368  int32       TempInt32;
3369
3370  /* Iterate through all mounted devices and try to make the indices on each
3371  one.  Don't bother if the index exists or the device doesn't support indices
3372  (actually queries). */
3373
3374  Cookie = 0;
3375  while ((DeviceID = next_dev (&Cookie)) >= 0)
3376  {
3377    if (!fs_stat_dev (DeviceID, &FSInfo) && (FSInfo.flags & B_FS_HAS_QUERY))
3378    {
3379      if (fs_stat_index (DeviceID, g_AttributeNameClassification, &IndexInfo)
3380      && errno == B_ENTRY_NOT_FOUND)
3381      {
3382        if (fs_create_index (DeviceID, g_AttributeNameClassification,
3383        B_STRING_TYPE, 0 /* flags */))
3384        {
3385          ErrorCode = errno;
3386          sprintf (ErrorMessage, "Unable to make string index %s on "
3387            "volume #%d, volume name \"%s\", file system type \"%s\", "
3388            "on device \"%s\"", g_AttributeNameClassification,
3389            (int) DeviceID, FSInfo.volume_name, FSInfo.fsh_name,
3390            FSInfo.device_name);
3391        }
3392      }
3393
3394      if (fs_stat_index (DeviceID, g_AttributeNameSpamRatio,
3395      &IndexInfo) && errno == B_ENTRY_NOT_FOUND)
3396      {
3397        if (fs_create_index (DeviceID, g_AttributeNameSpamRatio,
3398        B_FLOAT_TYPE, 0 /* flags */))
3399        {
3400          ErrorCode = errno;
3401          sprintf (ErrorMessage, "Unable to make float index %s on "
3402            "volume #%d, volume name \"%s\", file system type \"%s\", "
3403            "on device \"%s\"", g_AttributeNameSpamRatio,
3404            (int) DeviceID, FSInfo.volume_name, FSInfo.fsh_name,
3405            FSInfo.device_name);
3406        }
3407      }
3408    }
3409  }
3410  if (ErrorCode != B_OK)
3411    return ErrorCode;
3412
3413  /* Set up the MIME types for the classification attributes, associate them
3414  with e-mail and make them visible to the user (but not editable).  First need
3415  to get the existing MIME settings, then add ours to them (otherwise the
3416  existing ones get wiped out). */
3417
3418  ErrorCode = MimeType.SetTo ("text/x-email");
3419  if (ErrorCode != B_OK || !MimeType.IsInstalled ())
3420  {
3421    sprintf (ErrorMessage, "No e-mail MIME type (%s) in the system, can't "
3422      "update it to add our special attributes, and without e-mail this "
3423      "program is useless!", MimeType.Type ());
3424    if (ErrorCode == B_OK)
3425      ErrorCode = -1;
3426    return ErrorCode;
3427  }
3428
3429  ErrorCode = MimeType.GetAttrInfo (&Parameters);
3430  if (ErrorCode != B_OK)
3431  {
3432    sprintf (ErrorMessage, "Unable to retrieve list of attributes "
3433      "associated with e-mail messages in the MIME database");
3434    return ErrorCode;
3435  }
3436
3437  for (i = 0, iClassification = -1, iProbability = -1;
3438  i < 1000 && (iClassification < 0 || iProbability < 0);
3439  i++)
3440  {
3441    ErrorCode = Parameters.FindString ("attr:name", i, &StringPntr);
3442    if (ErrorCode != B_OK)
3443      break; /* Reached the end of the attributes. */
3444    if (strcmp (StringPntr, g_AttributeNameClassification) == 0)
3445      iClassification = i;
3446    else if (strcmp (StringPntr, g_AttributeNameSpamRatio) == 0)
3447      iProbability = i;
3448  }
3449
3450  /* Add extra default settings for those programs which previously didn't
3451  update the MIME database with all the attributes that exist (so our new
3452  additions don't show up at the wrong index). */
3453
3454  i--; /* Set i to index of last valid attribute. */
3455
3456  for (j = 0; j <= i; j++)
3457  {
3458    if (Parameters.FindString ("attr:public_name", j, &StringPntr) ==
3459    B_BAD_INDEX)
3460    {
3461      if (Parameters.FindString ("attr:name", j, &StringPntr) != B_OK)
3462        StringPntr = "None!";
3463      Parameters.AddString ("attr:public_name", StringPntr);
3464    }
3465  }
3466
3467  while (Parameters.FindInt32 ("attr:type", i, &TempInt32) == B_BAD_INDEX)
3468    Parameters.AddInt32 ("attr:type", B_STRING_TYPE);
3469
3470  while (Parameters.FindBool ("attr:viewable", i, &TempBool) == B_BAD_INDEX)
3471    Parameters.AddBool ("attr:viewable", true);
3472
3473  while (Parameters.FindBool ("attr:editable", i, &TempBool) == B_BAD_INDEX)
3474    Parameters.AddBool ("attr:editable", false);
3475
3476  while (Parameters.FindInt32 ("attr:width", i, &TempInt32) == B_BAD_INDEX)
3477    Parameters.AddInt32 ("attr:width", 60);
3478
3479  while (Parameters.FindInt32 ("attr:alignment", i, &TempInt32) == B_BAD_INDEX)
3480    Parameters.AddInt32 ("attr:alignment", B_ALIGN_LEFT);
3481
3482  while (Parameters.FindBool ("attr:extra", i, &TempBool) == B_BAD_INDEX)
3483    Parameters.AddBool ("attr:extra", false);
3484
3485  /* Add our new attributes to e-mail related things, if not already there. */
3486
3487  if (iClassification < 0)
3488  {
3489    Parameters.AddString ("attr:name", g_AttributeNameClassification);
3490    Parameters.AddString ("attr:public_name", "Classification Group");
3491    Parameters.AddInt32 ("attr:type", B_STRING_TYPE);
3492    Parameters.AddBool ("attr:viewable", true);
3493    Parameters.AddBool ("attr:editable", false);
3494    Parameters.AddInt32 ("attr:width", 45);
3495    Parameters.AddInt32 ("attr:alignment", B_ALIGN_LEFT);
3496    Parameters.AddBool ("attr:extra", false);
3497  }
3498
3499  if (iProbability < 0)
3500  {
3501    Parameters.AddString ("attr:name", g_AttributeNameSpamRatio);
3502    Parameters.AddString ("attr:public_name", "Spam/Genuine Estimate");
3503    Parameters.AddInt32 ("attr:type", B_FLOAT_TYPE);
3504    Parameters.AddBool ("attr:viewable", true);
3505    Parameters.AddBool ("attr:editable", false);
3506    Parameters.AddInt32 ("attr:width", 50);
3507    Parameters.AddInt32 ("attr:alignment", B_ALIGN_LEFT);
3508    Parameters.AddBool ("attr:extra", false);
3509  }
3510
3511  if (iClassification < 0 || iProbability < 0)
3512  {
3513    ErrorCode = MimeType.SetAttrInfo (&Parameters);
3514    if (ErrorCode != B_OK)
3515    {
3516      sprintf (ErrorMessage, "Unable to associate the classification "
3517        "attributes with e-mail messages in the MIME database");
3518      return ErrorCode;
3519    }
3520  }
3521
3522  /* Set up the MIME type for the database file. */
3523
3524  sprintf (ErrorMessage, "Problems with setting up MIME type (%s) for "
3525    "the database files", g_ABSDatabaseFileMIMEType); /* A generic message. */
3526
3527  ErrorCode = MimeType.SetTo (g_ABSDatabaseFileMIMEType);
3528  if (ErrorCode != B_OK)
3529    return ErrorCode;
3530
3531  MimeType.Delete ();
3532  ErrorCode = MimeType.Install ();
3533  if (ErrorCode != B_OK)
3534  {
3535    sprintf (ErrorMessage, "Failed to install MIME type (%s) in the system",
3536      MimeType.Type ());
3537    return ErrorCode;
3538  }
3539
3540  MimeType.SetShortDescription ("Spam Database");
3541  MimeType.SetLongDescription ("Bayesian Statistical Database for "
3542    "Classifying Junk E-Mail");
3543  sprintf (ErrorMessage, "1.0 ('%s')", g_DatabaseRecognitionString);
3544  MimeType.SetSnifferRule (ErrorMessage);
3545  MimeType.SetPreferredApp (g_ABSAppSignature);
3546
3547  /* Set up the names of the sound effects.  Later on the user can associate
3548  sound files with the names by using the Sounds preferences panel or the
3549  installsound command.  The MDR add-on filter will trigger these sounds. */
3550
3551  add_system_beep_event (g_BeepGenuine);
3552  add_system_beep_event (g_BeepSpam);
3553  add_system_beep_event (g_BeepUncertain);
3554
3555  return B_OK;
3556}
3557
3558
3559/* Load the database if it hasn't been loaded yet.  Otherwise do nothing. */
3560
3561status_t ABSApp::LoadDatabaseIfNeeded (char *ErrorMessage)
3562{
3563  if (m_WordMap.empty ())
3564    return LoadSaveDatabase (true /* DoLoad */, ErrorMessage);
3565
3566  return B_OK;
3567}
3568
3569
3570/* Either load the database of spam words (DoLoad is TRUE) from the file
3571specified in the settings, or write (DoLoad is FALSE) the database to it.  If
3572it doesn't exist (and its parent directories do exist) then it will be created
3573when saving.  If it doesn't exist when loading, the in-memory database will be
3574set to an empty one and an error will be returned with an explanation put into
3575ErrorMessage (should be big enough for a path name and a couple of lines of
3576text).
3577
3578The database file format is a UTF-8 text file (well, there could be some
3579latin-1 characters and other junk in there - it just copies the bytes from the
3580e-mail messages directly), with tab characters to separate fields (so that you
3581can also load it into a spreadsheet).  The first line identifies the overall
3582file type.  The second lists pairs of classifications plus the number of
3583messages in each class.  Currently it is just Genuine and Spam, but for future
compatibility, that could be followed by more classification pairs.  The
3585remaining lines each contain a word, the date it was last updated (actually
3586it's the number of messages in the database when the word was added, smaller
3587numbers mean it was updated longer ago), the genuine count and the spam count.
3588*/
3589
3590status_t ABSApp::LoadSaveDatabase (bool DoLoad, char *ErrorMessage)
3591{
3592  time_t                             CurrentTime;
3593  FILE                              *DatabaseFile = NULL;
3594  BNode                              DatabaseNode;
3595  BNodeInfo                          DatabaseNodeInfo;
3596  StatisticsMap::iterator            DataIter;
3597  StatisticsMap::iterator            EndIter;
3598  status_t                           ErrorCode;
3599  int                                i;
3600  pair<StatisticsMap::iterator,bool> InsertResult;
3601  char                               LineString [10240];
3602  StatisticsRecord                   Statistics;
3603  const char                        *StringPntr;
3604  char                              *TabPntr;
3605  const char                        *WordPntr;
3606
3607  if (DoLoad)
3608  {
3609    MakeDatabaseEmpty ();
3610    m_DatabaseHasChanged = false; /* In case of early error exit. */
3611  }
3612  else /* Saving the database, backup the old version on disk. */
3613  {
3614    ErrorCode = MakeBackup (ErrorMessage);
3615    if (ErrorCode != B_OK) /* Usually because the directory isn't there. */
3616      return ErrorCode;
3617  }
3618
3619  DatabaseFile = fopen (m_DatabaseFileName.String (), DoLoad ? "rb" : "wb");
3620  if (DatabaseFile == NULL)
3621  {
3622    ErrorCode = errno;
3623    sprintf (ErrorMessage, "Can't open database file \"%s\" for %s",
3624      m_DatabaseFileName.String (), DoLoad ? "reading" : "writing");
3625    goto ErrorExit;
3626  }
3627
3628  /* Process the first line, which identifies the file. */
3629
3630  if (DoLoad)
3631  {
3632    sprintf (ErrorMessage, "Can't read first line of database file \"%s\", "
3633      "expected it to start with \"%s\"",
3634      m_DatabaseFileName.String (), g_DatabaseRecognitionString);
3635    ErrorCode = -1;
3636
3637    if (fgets (LineString, sizeof (LineString), DatabaseFile) == NULL)
3638      goto ErrorExit;
3639    if (strncmp (LineString, g_DatabaseRecognitionString,
3640    strlen (g_DatabaseRecognitionString)) != 0)
3641      goto ErrorExit;
3642  }
3643  else /* Saving */
3644  {
3645    CurrentTime = time (NULL);
3646    if (fprintf (DatabaseFile, "%s V1 (word, age, genuine count, spam count)\t"
3647    "Written by SpamDBM $Revision: 30630 $\t"
3648    "Compiled on " __DATE__ " at " __TIME__ "\tThis file saved on %s",
3649    g_DatabaseRecognitionString, ctime (&CurrentTime)) <= 0)
3650    {
3651      ErrorCode = errno;
3652      sprintf (ErrorMessage, "Problems when writing to database file \"%s\"",
3653        m_DatabaseFileName.String ());
3654      goto ErrorExit;
3655    }
3656  }
3657
3658  /* The second line lists the different classifications.  We just check to see
3659  that the first two are Genuine and Spam.  If there are others, they'll be
3660  ignored and lost when the database is saved. */
3661
3662  if (DoLoad)
3663  {
3664    sprintf (ErrorMessage, "Can't read second line of database file \"%s\", "
3665      "expected it to list classifications %s and %s along with their totals",
3666      m_DatabaseFileName.String (), g_ClassifiedGenuine, g_ClassifiedSpam);
3667    ErrorCode = B_BAD_VALUE;
3668
3669    if (fgets (LineString, sizeof (LineString), DatabaseFile) == NULL)
3670      goto ErrorExit;
3671    i = strlen (LineString);
3672    if (i > 0 && LineString[i-1] == '\n')
3673      LineString[i-1] = 0; /* Remove trailing line feed character. */
3674
3675    /* Look for the title word at the start of the line. */
3676
3677    TabPntr = LineString;
3678    for (StringPntr = TabPntr; *TabPntr != 0 && *TabPntr != '\t'; TabPntr++)
3679      ;
3680    if (*TabPntr == '\t') *TabPntr++ = 0; /* Stringify up to next tab. */
3681
3682    if (strncmp (StringPntr, "Classifications", 15) != 0)
3683      goto ErrorExit;
3684
3685    /* Look for the Genuine class and count. */
3686
3687    for (StringPntr = TabPntr; *TabPntr != 0 && *TabPntr != '\t'; TabPntr++)
3688      ;
3689    if (*TabPntr == '\t') *TabPntr++ = 0; /* Stringify up to next tab. */
3690
3691    if (strcmp (StringPntr, g_ClassifiedGenuine) != 0)
3692      goto ErrorExit;
3693
3694    for (StringPntr = TabPntr; *TabPntr != 0 && *TabPntr != '\t'; TabPntr++)
3695      ;
3696    if (*TabPntr == '\t') *TabPntr++ = 0; /* Stringify up to next tab. */
3697
3698    m_TotalGenuineMessages = atoll (StringPntr);
3699
3700    /* Look for the Spam class and count. */
3701
3702    for (StringPntr = TabPntr; *TabPntr != 0 && *TabPntr != '\t'; TabPntr++)
3703      ;
3704    if (*TabPntr == '\t') *TabPntr++ = 0; /* Stringify up to next tab. */
3705
3706    if (strcmp (StringPntr, g_ClassifiedSpam) != 0)
3707      goto ErrorExit;
3708
3709    for (StringPntr = TabPntr; *TabPntr != 0 && *TabPntr != '\t'; TabPntr++)
3710      ;
3711    if (*TabPntr == '\t') *TabPntr++ = 0; /* Stringify up to next tab. */
3712
3713    m_TotalSpamMessages = atoll (StringPntr);
3714  }
3715  else /* Saving */
3716  {
3717    fprintf (DatabaseFile,
3718      "Classifications and total messages:\t%s\t%" B_PRIu32
3719        "\t%s\t%" B_PRIu32 "\n",
3720      g_ClassifiedGenuine, m_TotalGenuineMessages,
3721      g_ClassifiedSpam, m_TotalSpamMessages);
3722  }
3723
3724  /* The remainder of the file is the list of words and statistics.  Each line
3725  has a word, a tab, the time when the word was last changed in the database
3726  (sequence number of message addition, starts at 0 and goes up by one for each
3727  message added to the database), a tab then the number of messages in the
3728  first class (genuine) that had that word, then a tab, then the number of
3729  messages in the second class (spam) with that word, and so on. */
3730
3731  if (DoLoad)
3732  {
3733    while (!feof (DatabaseFile))
3734    {
3735      if (fgets (LineString, sizeof (LineString), DatabaseFile) == NULL)
3736      {
3737        ErrorCode = errno;
3738        if (feof (DatabaseFile))
3739          break;
3740        if (ErrorCode == B_OK)
3741          ErrorCode = -1;
3742        sprintf (ErrorMessage, "Error while reading words and statistics "
3743          "from database file \"%s\"", m_DatabaseFileName.String ());
3744        goto ErrorExit;
3745      }
3746
3747      i = strlen (LineString);
3748      if (i > 0 && LineString[i-1] == '\n')
3749        LineString[i-1] = 0; /* Remove trailing line feed character. */
3750
3751      /* Get the word at the start of the line, save in WordPntr. */
3752
3753      TabPntr = LineString;
3754      for (WordPntr = TabPntr; *TabPntr != 0 && *TabPntr != '\t'; TabPntr++)
3755        ;
3756      if (*TabPntr == '\t') *TabPntr++ = 0; /* Stringify up to next tab. */
3757
3758      /* Get the date stamp.  Actually a sequence number, not a date. */
3759
3760      for (StringPntr = TabPntr; *TabPntr != 0 && *TabPntr != '\t'; TabPntr++)
3761        ;
3762      if (*TabPntr == '\t') *TabPntr++ = 0; /* Stringify up to next tab. */
3763
3764      Statistics.age = atoll (StringPntr);
3765
3766      /* Get the Genuine count. */
3767
3768      for (StringPntr = TabPntr; *TabPntr != 0 && *TabPntr != '\t'; TabPntr++)
3769        ;
3770      if (*TabPntr == '\t') *TabPntr++ = 0; /* Stringify up to next tab. */
3771
3772      Statistics.genuineCount = atoll (StringPntr);
3773
3774      /* Get the Spam count. */
3775
3776      for (StringPntr = TabPntr; *TabPntr != 0 && *TabPntr != '\t'; TabPntr++)
3777        ;
3778      if (*TabPntr == '\t') *TabPntr++ = 0; /* Stringify up to next tab. */
3779
3780      Statistics.spamCount = atoll (StringPntr);
3781
3782      /* Ignore empty words, totally unused words and ones which are too long
3783      (avoids lots of length checking everywhere). */
3784
3785      if (WordPntr[0] == 0 || strlen (WordPntr) > g_MaxWordLength ||
3786      (Statistics.genuineCount <= 0 && Statistics.spamCount <= 0))
3787        continue; /* Ignore this line of text, start on next one. */
3788
3789      /* Add the combination to the database. */
3790
3791      InsertResult = m_WordMap.insert (
3792        StatisticsMap::value_type (WordPntr, Statistics));
3793      if (InsertResult.second == false)
3794      {
3795        ErrorCode = B_BAD_VALUE;
3796        sprintf (ErrorMessage, "Error while inserting word \"%s\" from "
3797          "database \"%s\", perhaps it is a duplicate",
3798          WordPntr, m_DatabaseFileName.String ());
3799        goto ErrorExit;
3800      }
3801      m_WordCount++;
3802
3803      /* And the hunt for the oldest word. */
3804
3805      if (Statistics.age < m_OldestAge)
3806        m_OldestAge = Statistics.age;
3807    }
3808  }
3809  else /* Saving, dump all words and statistics to the file. */
3810  {
3811    EndIter = m_WordMap.end ();
3812    for (DataIter = m_WordMap.begin (); DataIter != EndIter; DataIter++)
3813    {
3814      if (fprintf (DatabaseFile,
3815      "%s\t%" B_PRIu32 "\t%" B_PRIu32 "\t%" B_PRIu32 "\n",
3816      DataIter->first.c_str (), DataIter->second.age,
3817      DataIter->second.genuineCount, DataIter->second.spamCount) <= 0)
3818      {
3819        ErrorCode = errno;
3820        sprintf (ErrorMessage, "Error while writing word \"%s\" to "
3821          "database \"%s\"",
3822          DataIter->first.c_str(), m_DatabaseFileName.String ());
3823        goto ErrorExit;
3824      }
3825    }
3826  }
3827
3828  /* Set the file type so that the new file gets associated with this program,
3829  and picks up the right icon. */
3830
3831  if (!DoLoad)
3832  {
3833    sprintf (ErrorMessage, "Unable to set attributes (file type) of database "
3834      "file \"%s\"", m_DatabaseFileName.String ());
3835    ErrorCode = DatabaseNode.SetTo (m_DatabaseFileName.String ());
3836    if (ErrorCode != B_OK)
3837      goto ErrorExit;
3838    DatabaseNodeInfo.SetTo (&DatabaseNode);
3839    ErrorCode = DatabaseNodeInfo.SetType (g_ABSDatabaseFileMIMEType);
3840    if (ErrorCode != B_OK)
3841      goto ErrorExit;
3842  }
3843
3844  /* Success! */
3845  m_DatabaseHasChanged = false;
3846  ErrorCode = B_OK;
3847
3848ErrorExit:
3849  if (DatabaseFile != NULL)
3850    fclose (DatabaseFile);
3851  return ErrorCode;
3852}
3853
3854
3855/* Either load the settings (DoLoad is TRUE) from the configuration file or
3856write them (DoLoad is FALSE) to it.  The configuration file is a flattened
3857BMessage containing the various program settings.  If it doesn't exist (and its
3858parent directories don't exist) then it will be created when saving.  If it
3859doesn't exist when loading, the settings will be set to default values. */
3860
3861status_t ABSApp::LoadSaveSettings (bool DoLoad)
3862{
3863  status_t    ErrorCode;
3864  const char *NamePntr;
3865  BMessage    Settings;
3866  BDirectory  SettingsDirectory;
3867  BFile       SettingsFile;
3868  const char *StringPntr;
3869  bool        TempBool;
3870  int32       TempInt32;
3871  char        TempString [PATH_MAX + 100];
3872
3873  /* Preset things to default values if loading, in case of an error or it's an
3874  older version of the settings file which doesn't have every field defined. */
3875
3876  if (DoLoad)
3877    DefaultSettings ();
3878
3879  /* Look for our settings directory.  When saving we can try to create it. */
3880
3881  ErrorCode = SettingsDirectory.SetTo (m_SettingsDirectoryPath.Path ());
3882  if (ErrorCode != B_OK)
3883  {
3884    if (DoLoad || ErrorCode != B_ENTRY_NOT_FOUND)
3885    {
3886      sprintf (TempString, "Can't find settings directory \"%s\"",
3887        m_SettingsDirectoryPath.Path ());
3888      goto ErrorExit;
3889    }
3890    ErrorCode = create_directory (m_SettingsDirectoryPath.Path (), 0755);
3891    if (ErrorCode == B_OK)
3892      ErrorCode = SettingsDirectory.SetTo (m_SettingsDirectoryPath.Path ());
3893    if (ErrorCode != B_OK)
3894    {
3895      sprintf (TempString, "Can't create settings directory \"%s\"",
3896        m_SettingsDirectoryPath.Path ());
3897      goto ErrorExit;
3898    }
3899  }
3900
3901  ErrorCode = SettingsFile.SetTo (&SettingsDirectory, g_SettingsFileName,
3902    DoLoad ? B_READ_ONLY : B_READ_WRITE | B_CREATE_FILE | B_ERASE_FILE);
3903  if (ErrorCode != B_OK)
3904  {
3905    sprintf (TempString, "Can't open settings file \"%s\" in directory \"%s\" "
3906      "for %s", g_SettingsFileName, m_SettingsDirectoryPath.Path(),
3907      DoLoad ? "reading" : "writing");
3908    goto ErrorExit;
3909  }
3910
3911  if (DoLoad)
3912  {
3913    ErrorCode = Settings.Unflatten (&SettingsFile);
3914    if (ErrorCode != 0 || Settings.what != g_SettingsWhatCode)
3915    {
3916      sprintf (TempString, "Corrupt data detected while reading settings "
3917        "file \"%s\" in directory \"%s\", will revert to defaults",
3918        g_SettingsFileName, m_SettingsDirectoryPath.Path());
3919      goto ErrorExit;
3920    }
3921  }
3922
3923  /* Transfer the settings between the BMessage and our various global
3924  variables.  For loading, if the setting isn't present, leave it at the
3925  default value.  Note that loading and saving are intermingled here to make
3926  code maintenance easier (less chance of forgetting to update it if load and
3927  save were separate functions). */
3928
3929  ErrorCode = B_OK; /* So that saving settings can record an error. */
3930
3931  NamePntr = "DatabaseFileName";
3932  if (DoLoad)
3933  {
3934    if (Settings.FindString (NamePntr, &StringPntr) == B_OK)
3935      m_DatabaseFileName.SetTo (StringPntr);
3936  }
3937  else if (ErrorCode == B_OK)
3938    ErrorCode = Settings.AddString (NamePntr, m_DatabaseFileName);
3939
3940  NamePntr = "ServerMode";
3941  if (DoLoad)
3942  {
3943    if (Settings.FindBool (NamePntr, &TempBool) == B_OK)
3944      g_ServerMode = TempBool;
3945  }
3946  else if (ErrorCode == B_OK)
3947    ErrorCode = Settings.AddBool (NamePntr, g_ServerMode);
3948
3949  NamePntr = "IgnorePreviousClassification";
3950  if (DoLoad)
3951  {
3952    if (Settings.FindBool (NamePntr, &TempBool) == B_OK)
3953      m_IgnorePreviousClassification = TempBool;
3954  }
3955  else if (ErrorCode == B_OK)
3956    ErrorCode = Settings.AddBool (NamePntr, m_IgnorePreviousClassification);
3957
3958  NamePntr = "PurgeAge";
3959  if (DoLoad)
3960  {
3961    if (Settings.FindInt32 (NamePntr, &TempInt32) == B_OK)
3962      m_PurgeAge = TempInt32;
3963  }
3964  else if (ErrorCode == B_OK)
3965    ErrorCode = Settings.AddInt32 (NamePntr, m_PurgeAge);
3966
3967  NamePntr = "PurgePopularity";
3968  if (DoLoad)
3969  {
3970    if (Settings.FindInt32 (NamePntr, &TempInt32) == B_OK)
3971      m_PurgePopularity = TempInt32;
3972  }
3973  else if (ErrorCode == B_OK)
3974    ErrorCode = Settings.AddInt32 (NamePntr, m_PurgePopularity);
3975
3976  NamePntr = "ScoringMode";
3977  if (DoLoad)
3978  {
3979    if (Settings.FindInt32 (NamePntr, &TempInt32) == B_OK)
3980      m_ScoringMode = (ScoringModes) TempInt32;
3981    if (m_ScoringMode < 0 || m_ScoringMode >= SM_MAX)
3982      m_ScoringMode = (ScoringModes) 0;
3983  }
3984  else if (ErrorCode == B_OK)
3985    ErrorCode = Settings.AddInt32 (NamePntr, m_ScoringMode);
3986
3987  NamePntr = "TokenizeMode";
3988  if (DoLoad)
3989  {
3990    if (Settings.FindInt32 (NamePntr, &TempInt32) == B_OK)
3991      m_TokenizeMode = (TokenizeModes) TempInt32;
3992    if (m_TokenizeMode < 0 || m_TokenizeMode >= TM_MAX)
3993      m_TokenizeMode = (TokenizeModes) 0;
3994  }
3995  else if (ErrorCode == B_OK)
3996    ErrorCode = Settings.AddInt32 (NamePntr, m_TokenizeMode);
3997
3998  if (ErrorCode != B_OK)
3999  {
4000    strcpy (TempString, "Unable to stuff the program settings into a "
4001      "temporary BMessage, settings not saved");
4002    goto ErrorExit;
4003  }
4004
4005  /* Save the settings BMessage to the settings file. */
4006
4007  if (!DoLoad)
4008  {
4009    Settings.what = g_SettingsWhatCode;
4010    ErrorCode = Settings.Flatten (&SettingsFile);
4011    if (ErrorCode != 0)
4012    {
4013      sprintf (TempString, "Problems while writing settings file \"%s\" in "
4014        "directory \"%s\"", g_SettingsFileName,
4015        m_SettingsDirectoryPath.Path ());
4016      goto ErrorExit;
4017    }
4018  }
4019
4020  m_SettingsHaveChanged = false;
4021  return B_OK;
4022
4023ErrorExit: /* Error message in TempString, code in ErrorCode. */
4024  DisplayErrorMessage (TempString, ErrorCode, DoLoad ?
4025    "Loading Settings Error" : "Saving Settings Error");
4026  return ErrorCode;
4027}
4028
4029
4030void
4031ABSApp::MessageReceived (BMessage *MessagePntr)
4032{
4033  const char           *PropertyName;
4034  struct property_info *PropInfoPntr;
4035  int32                 SpecifierIndex;
4036  int32                 SpecifierKind;
4037  BMessage              SpecifierMessage;
4038
4039  /* See if it is a scripting message that applies to the database or one of
4040  the other operations this program supports.  Pass on other scripting messages
4041  to the inherited parent MessageReceived function (they're usually scripting
4042  messages for the BApplication). */
4043
4044  switch (MessagePntr->what)
4045  {
4046    case B_GET_PROPERTY:
4047    case B_SET_PROPERTY:
4048    case B_COUNT_PROPERTIES:
4049    case B_CREATE_PROPERTY:
4050    case B_DELETE_PROPERTY:
4051    case B_EXECUTE_PROPERTY:
4052      if (MessagePntr->GetCurrentSpecifier (&SpecifierIndex, &SpecifierMessage,
4053      &SpecifierKind, &PropertyName) == B_OK &&
4054      SpecifierKind == B_DIRECT_SPECIFIER)
4055      {
4056        for (PropInfoPntr = g_ScriptingPropertyList + 0; true; PropInfoPntr++)
4057        {
4058          if (PropInfoPntr->name == 0)
4059            break; /* Ran out of commands. */
4060
4061          if (PropInfoPntr->commands[0] == MessagePntr->what &&
4062          strcasecmp (PropInfoPntr->name, PropertyName) == 0)
4063          {
4064            ProcessScriptingMessage (MessagePntr, PropInfoPntr);
4065            return;
4066          }
4067        }
4068      }
4069      break;
4070  }
4071
4072  /* Pass the unprocessed message to the inherited function, maybe it knows
4073  what to do.  This includes replies to messages we sent ourselves. */
4074
4075  BApplication::MessageReceived (MessagePntr);
4076}
4077
4078
4079/* Rename the existing database file to a backup file name, potentially
4080replacing an older backup.  If something goes wrong, returns an error code and
4081puts an explanation in ErrorMessage. */
4082
4083status_t ABSApp::MakeBackup (char *ErrorMessage)
4084{
4085  BEntry   Entry;
4086  status_t ErrorCode;
4087  int      i;
4088  char     LeafName [NAME_MAX];
4089  char     NewName [PATH_MAX+20];
4090  char     OldName [PATH_MAX+20];
4091
4092  ErrorCode = Entry.SetTo (m_DatabaseFileName.String ());
4093  if (ErrorCode != B_OK)
4094  {
4095    sprintf (ErrorMessage, "While making backup, failed to make a BEntry for "
4096      "\"%s\" (maybe the directory doesn't exist?)",
4097      m_DatabaseFileName.String ());
4098    return ErrorCode;
4099  }
4100  if (!Entry.Exists ())
4101    return B_OK; /* No existing file to worry about overwriting. */
4102  Entry.GetName (LeafName);
4103
4104  /* Find the first hole (no file) where we will stop the renaming chain. */
4105
4106  for (i = 0; i < g_MaxBackups - 1; i++)
4107  {
4108    strcpy (OldName, m_DatabaseFileName.String ());
4109    sprintf (OldName + strlen (OldName), g_BackupSuffix, i);
4110    Entry.SetTo (OldName);
4111    if (!Entry.Exists ())
4112      break;
4113  }
4114
4115  /* Move the files down by one to fill in the hole in the name series. */
4116
4117  for (i--; i >= 0; i--)
4118  {
4119    strcpy (OldName, m_DatabaseFileName.String ());
4120    sprintf (OldName + strlen (OldName), g_BackupSuffix, i);
4121    Entry.SetTo (OldName);
4122    strcpy (NewName, LeafName);
4123    sprintf (NewName + strlen (NewName), g_BackupSuffix, i + 1);
4124    ErrorCode = Entry.Rename (NewName, true /* clobber */);
4125  }
4126
4127  Entry.SetTo (m_DatabaseFileName.String ());
4128  strcpy (NewName, LeafName);
4129  sprintf (NewName + strlen (NewName), g_BackupSuffix, 0);
4130  ErrorCode = Entry.Rename (NewName, true /* clobber */);
4131  if (ErrorCode != B_OK)
4132    sprintf (ErrorMessage, "While making backup, failed to rename "
4133      "\"%s\" to \"%s\"", m_DatabaseFileName.String (), NewName);
4134
4135  return ErrorCode;
4136}
4137
4138
4139void
4140ABSApp::MakeDatabaseEmpty ()
4141{
4142  m_WordMap.clear (); /* Sets the map to empty, deallocating any old data. */
4143  m_WordCount = 0;
4144  m_TotalGenuineMessages = 0;
4145  m_TotalSpamMessages = 0;
4146  m_OldestAge = (uint32) -1 /* makes largest number possible */;
4147}
4148
4149
/* Do what the scripting command says.  A reply message will be sent back with
several fields: "error" containing the numerical error code (0 for success),
"CommandText" with a text representation of the command, "result" with the
resulting data for a get or count command.  If it isn't understood, then rather
than a B_REPLY kind of message, it will be a B_MESSAGE_NOT_UNDERSTOOD message
with an "error" number and a "message" string with a description. */
4156
void
ABSApp::ProcessScriptingMessage (
  BMessage *MessagePntr,
  struct property_info *PropInfoPntr)
{
  /* MessagePntr is the scripting message to carry out and PropInfoPntr is the
  property table entry it applies to (MessageReceived passes in the entry
  whose name and command code matched the message).  A reply is always sent
  back to the sender, through either the success path or ErrorExit. */

  bool        ArgumentBool = false;
  bool        ArgumentGotBool = false;
  bool        ArgumentGotInt32 = false;
  bool        ArgumentGotString = false;
  int32       ArgumentInt32 = 0;
  const char *ArgumentString = NULL;
  BString     CommandText;
  status_t    ErrorCode;
  int         i;
  BMessage    ReplyMessage (B_MESSAGE_NOT_UNDERSTOOD);
  ssize_t     StringBufferSize;
  BMessage    TempBMessage;
  BPath       TempPath;
  char        TempString [PATH_MAX + 1024];

  /* A scripting command arriving while a quit countdown is running cancels
  the countdown, except when in command line mode. */

  if (g_QuitCountdown >= 0 && !g_CommandLineMode)
  {
    g_QuitCountdown = -1;
    cerr << "Quit countdown aborted due to a scripting command arriving.\n";
  }

  if (g_BusyCursor != NULL)
    SetCursor (g_BusyCursor);

  /* Look for the command's argument in the field named by g_DataName, trying
  it as a string first, then as a bool, then as a 32 bit integer.  At most one
  of the ArgumentGot flags ends up true.  A string argument has to be shorter
  than PATH_MAX, unless it is for one of the three properties excluded below
  (PN_EVALUATE_STRING, PN_SPAM_STRING, PN_GENUINE_STRING).  Note that an error
  exit taken before ReplyMessage.what is changed further down sends the reply
  as B_MESSAGE_NOT_UNDERSTOOD. */

  ErrorCode = MessagePntr->FindData (g_DataName, B_STRING_TYPE,
    (const void **) &ArgumentString, &StringBufferSize);
  if (ErrorCode == B_OK)
  {
    if (PropInfoPntr->extra_data != PN_EVALUATE_STRING &&
    PropInfoPntr->extra_data != PN_SPAM_STRING &&
    PropInfoPntr->extra_data != PN_GENUINE_STRING &&
    strlen (ArgumentString) >= PATH_MAX)
    {
      sprintf (TempString, "\"data\" string of a scripting message is too "
        "long, for SET %s action", PropInfoPntr->name);
      ErrorCode = B_NAME_TOO_LONG;
      goto ErrorExit;
    }
    ArgumentGotString = true;
  }
  else if (MessagePntr->FindBool (g_DataName, &ArgumentBool) == B_OK)
    ArgumentGotBool = true;
  else if (MessagePntr->FindInt32 (g_DataName, &ArgumentInt32) == B_OK)
    ArgumentGotInt32 = true;

  /* Prepare a Human readable description of the scripting command. */

  switch (PropInfoPntr->commands[0])
  {
    case B_SET_PROPERTY:
      CommandText.SetTo ("Set ");
      break;

    case B_GET_PROPERTY:
      CommandText.SetTo ("Get ");
      break;

    case B_COUNT_PROPERTIES:
      CommandText.SetTo ("Count ");
      break;

    case B_CREATE_PROPERTY:
      CommandText.SetTo ("Create ");
      break;

    case B_DELETE_PROPERTY:
      CommandText.SetTo ("Delete ");
      break;

    case B_EXECUTE_PROPERTY:
      CommandText.SetTo ("Execute ");
      break;

    default:
      sprintf (TempString, "Bug: scripting command for \"%s\" has an unknown "
        "action code %d", PropInfoPntr->name,
        (int) PropInfoPntr->commands[0]);
      ErrorCode = -1;
      goto ErrorExit;
  }
  CommandText.Append (PropInfoPntr->name);

  /* Add on the argument value to our readable command, if there is one. */

  if (ArgumentGotString)
  {
    CommandText.Append (" \"");
    CommandText.Append (ArgumentString);
    CommandText.Append ("\"");
  }
  if (ArgumentGotBool)
    CommandText.Append (ArgumentBool ? " true" : " false");
  if (ArgumentGotInt32)
  {
    sprintf (TempString, " %" B_PRId32, ArgumentInt32);
    CommandText.Append (TempString);
  }

  /* From now on the scripting command has been recognized and is in the
  correct format, so it always returns a B_REPLY message.  A readable version
  of the command is also added to make debugging easier. */

  ReplyMessage.what = B_REPLY;
  ReplyMessage.AddString ("CommandText", CommandText);

  /* Now actually do the command.  First prepare a default error message. */

  sprintf (TempString, "Operation code %d (get, set, count, etc) "
    "unsupported for property %s",
    (int) PropInfoPntr->commands[0], PropInfoPntr->name);
  ErrorCode = B_BAD_INDEX;

  /* The outer switch picks the property, the code inside each case picks the
  operation.  A case which does "break" falls out to the success reply at the
  bottom; a "goto ErrorExit" sends back whatever is in TempString and
  ErrorCode at that moment (the defaults set just above, unless the case
  replaced them). */

  switch (PropInfoPntr->extra_data)
  {
    case PN_DATABASE_FILE:
      switch (PropInfoPntr->commands[0])
      {
        case B_GET_PROPERTY: /* Get the database file name. */
          ReplyMessage.AddString (g_ResultName, m_DatabaseFileName);
          break;

        case B_SET_PROPERTY: /* Set the database file name to a new one. */
          if (!ArgumentGotString)
          {
            ErrorCode = B_BAD_TYPE;
            sprintf (TempString, "You need to specify a string for the "
              "SET %s command", PropInfoPntr->name);
            goto ErrorExit;
          }
          ErrorCode = TempPath.SetTo (ArgumentString, NULL /* leaf */,
            true /* normalize - verifies parent directories exist */);
          if (ErrorCode != B_OK)
          {
            sprintf (TempString, "New database path name of \"%s\" is invalid "
              "(parent directories must exist)", ArgumentString);
            goto ErrorExit;
          }
          if ((ErrorCode = SaveDatabaseIfNeeded (TempString)) != B_OK)
            goto ErrorExit;
          MakeDatabaseEmpty (); /* So that the new one gets loaded if used. */

          if (strlen (TempPath.Leaf ()) > NAME_MAX-strlen(g_BackupSuffix)-1)
          {
            /* Truncate the name so that there is enough space for the backup
            extension.  Approximately. */
            /* TempString is just scratch space for the shortened leaf name
            here, the default error message in it is no longer needed. */
            strcpy (TempString, TempPath.Leaf ());
            TempString [NAME_MAX - strlen (g_BackupSuffix) - 1] = 0;
            TempPath.GetParent (&TempPath);
            TempPath.Append (TempString);
          }
          m_DatabaseFileName.SetTo (TempPath.Path ());
          m_SettingsHaveChanged = true;
          break;

        case B_CREATE_PROPERTY: /* Make a new database file plus more. */
          if ((ErrorCode = CreateDatabaseFile (TempString)) != B_OK)
            goto ErrorExit;
          break;

        case B_DELETE_PROPERTY: /* Delete the file and its backups too. */
          if ((ErrorCode = DeleteDatabaseFile (TempString)) != B_OK)
            goto ErrorExit;
          break;

        case B_COUNT_PROPERTIES:
          /* The count for the database file is the number of words in it. */
          if ((ErrorCode = LoadDatabaseIfNeeded (TempString)) != B_OK)
            goto ErrorExit;
          ReplyMessage.AddInt32 (g_ResultName, m_WordCount);
          break;

        default: /* Unknown operation code, error message already set. */
          goto ErrorExit;
      }
      break;

    case PN_SPAM:
    case PN_SPAM_STRING:
    case PN_GENUINE:
    case PN_GENUINE_STRING:
    case PN_UNCERTAIN:
      switch (PropInfoPntr->commands[0])
      {
        case B_COUNT_PROPERTIES: /* Get the number of spam/genuine messages. */
          if ((ErrorCode = LoadDatabaseIfNeeded (TempString)) != B_OK)
            goto ErrorExit;
          /* Anything other than the two spam properties, PN_UNCERTAIN
          included, reports the genuine message total. */
          if (PropInfoPntr->extra_data == PN_SPAM ||
          PropInfoPntr->extra_data == PN_SPAM_STRING)
            ReplyMessage.AddInt32 (g_ResultName, m_TotalSpamMessages);
          else
            ReplyMessage.AddInt32 (g_ResultName, m_TotalGenuineMessages);
          break;

        case B_SET_PROPERTY: /* Add spam/genuine/uncertain to database. */
          if (!ArgumentGotString)
          {
            ErrorCode = B_BAD_TYPE;
            sprintf (TempString, "You need to specify a string (%s) "
              "for the SET %s command",
              (PropInfoPntr->extra_data == PN_GENUINE_STRING ||
              PropInfoPntr->extra_data == PN_SPAM_STRING)
              ? "text of the message to be added"
              : "pathname of the file containing the text to be added",
              PropInfoPntr->name);
            goto ErrorExit;
          }
          if ((ErrorCode = LoadDatabaseIfNeeded (TempString)) != B_OK)
            goto ErrorExit;
          /* The plain properties pass the argument to AddFileToDatabase, the
          _STRING ones pass it to AddStringToDatabase. */
          if (PropInfoPntr->extra_data == PN_GENUINE ||
          PropInfoPntr->extra_data == PN_SPAM ||
          PropInfoPntr->extra_data == PN_UNCERTAIN)
            ErrorCode = AddFileToDatabase (
              (PropInfoPntr->extra_data == PN_SPAM) ? CL_SPAM :
              ((PropInfoPntr->extra_data == PN_GENUINE) ? CL_GENUINE :
              CL_UNCERTAIN),
              ArgumentString, TempString /* ErrorMessage */);
          else
            ErrorCode = AddStringToDatabase (
              (PropInfoPntr->extra_data == PN_SPAM_STRING) ?
              CL_SPAM : CL_GENUINE,
              ArgumentString, TempString /* ErrorMessage */);
          if (ErrorCode != B_OK)
            goto ErrorExit;
          break;

        default: /* Unknown operation code, error message already set. */
          goto ErrorExit;
      }
      break;

    case PN_IGNORE_PREVIOUS_CLASSIFICATION:
      switch (PropInfoPntr->commands[0])
      {
        case B_GET_PROPERTY:
          ReplyMessage.AddBool (g_ResultName, m_IgnorePreviousClassification);
          break;

        case B_SET_PROPERTY:
          if (!ArgumentGotBool)
          {
            ErrorCode = B_BAD_TYPE;
            sprintf (TempString, "You need to specify a boolean (true/yes, "
              "false/no) for the SET %s command", PropInfoPntr->name);
            goto ErrorExit;
          }
          m_IgnorePreviousClassification = ArgumentBool;
          m_SettingsHaveChanged = true;
          break;

        default: /* Unknown operation code, error message already set. */
          goto ErrorExit;
      }
      break;

    case PN_SERVER_MODE:
      switch (PropInfoPntr->commands[0])
      {
        case B_GET_PROPERTY:
          ReplyMessage.AddBool (g_ResultName, g_ServerMode);
          break;

        case B_SET_PROPERTY:
          if (!ArgumentGotBool)
          {
            ErrorCode = B_BAD_TYPE;
            sprintf (TempString, "You need to specify a boolean (true/yes, "
              "false/no) for the SET %s command", PropInfoPntr->name);
            goto ErrorExit;
          }
          g_ServerMode = ArgumentBool;
          m_SettingsHaveChanged = true;
          break;

        default: /* Unknown operation code, error message already set. */
          goto ErrorExit;
      }
      break;

    case PN_FLUSH:
      /* Only the execute operation is accepted.  For any other operation the
      save isn't attempted and the default error set earlier is returned. */
      if (PropInfoPntr->commands[0] == B_EXECUTE_PROPERTY &&
      (ErrorCode = SaveDatabaseIfNeeded (TempString)) == B_OK)
        break;
      goto ErrorExit;

    case PN_PURGE_AGE:
      switch (PropInfoPntr->commands[0])
      {
        case B_GET_PROPERTY:
          ReplyMessage.AddInt32 (g_ResultName, m_PurgeAge);
          break;

        case B_SET_PROPERTY:
          if (!ArgumentGotInt32)
          {
            ErrorCode = B_BAD_TYPE;
            sprintf (TempString, "You need to specify a 32 bit integer "
              "for the SET %s command", PropInfoPntr->name);
            goto ErrorExit;
          }
          m_PurgeAge = ArgumentInt32;
          m_SettingsHaveChanged = true;
          break;

        default: /* Unknown operation code, error message already set. */
          goto ErrorExit;
      }
      break;

    case PN_PURGE_POPULARITY:
      switch (PropInfoPntr->commands[0])
      {
        case B_GET_PROPERTY:
          ReplyMessage.AddInt32 (g_ResultName, m_PurgePopularity);
          break;

        case B_SET_PROPERTY:
          if (!ArgumentGotInt32)
          {
            ErrorCode = B_BAD_TYPE;
            sprintf (TempString, "You need to specify a 32 bit integer "
              "for the SET %s command", PropInfoPntr->name);
            goto ErrorExit;
          }
          m_PurgePopularity = ArgumentInt32;
          m_SettingsHaveChanged = true;
          break;

        default: /* Unknown operation code, error message already set. */
          goto ErrorExit;
      }
      break;

    case PN_PURGE:
      /* Execute only: load the database if needed, then purge old words. */
      if (PropInfoPntr->commands[0] == B_EXECUTE_PROPERTY &&
      (ErrorCode = LoadDatabaseIfNeeded (TempString)) == B_OK &&
      (ErrorCode = PurgeOldWords (TempString)) == B_OK)
        break;
      goto ErrorExit;

    case PN_OLDEST:
      /* Get only: reports m_OldestAge once the database is loaded. */
      if (PropInfoPntr->commands[0] == B_GET_PROPERTY &&
      (ErrorCode = LoadDatabaseIfNeeded (TempString)) == B_OK)
      {
        ReplyMessage.AddInt32 (g_ResultName, m_OldestAge);
        break;
      }
      goto ErrorExit;

    case PN_EVALUATE:
    case PN_EVALUATE_STRING:
      /* Set only.  The evaluation functions are handed ReplyMessage to put
      their results into.  Every path which doesn't succeed drops out of the
      nested ifs to the goto at the end of this case. */
      if (PropInfoPntr->commands[0] == B_SET_PROPERTY)
      {
        if (!ArgumentGotString)
        {
          ErrorCode = B_BAD_TYPE;
          sprintf (TempString, "You need to specify a string for the "
            "SET %s command", PropInfoPntr->name);
          goto ErrorExit;
        }
        if ((ErrorCode = LoadDatabaseIfNeeded (TempString)) == B_OK)
        {
          if (PropInfoPntr->extra_data == PN_EVALUATE)
          {
            if ((ErrorCode = EvaluateFile (ArgumentString, &ReplyMessage,
            TempString)) == B_OK)
              break;
          }
          else /* PN_EVALUATE_STRING */
          {
            if ((ErrorCode = EvaluateString (ArgumentString, StringBufferSize,
            &ReplyMessage, TempString)) == B_OK)
              break;
          }
        }
      }
      goto ErrorExit;

    case PN_RESET_TO_DEFAULTS:
      if (PropInfoPntr->commands[0] == B_EXECUTE_PROPERTY)
      {
        DefaultSettings ();
        break;
      }
      goto ErrorExit;

    case PN_INSTALL_THINGS:
      if (PropInfoPntr->commands[0] == B_EXECUTE_PROPERTY &&
      (ErrorCode = InstallThings (TempString)) == B_OK)
        break;
      goto ErrorExit;

    case PN_SCORING_MODE:
      switch (PropInfoPntr->commands[0])
      {
        case B_GET_PROPERTY:
          ReplyMessage.AddString (g_ResultName,
            g_ScoringModeNames[m_ScoringMode]);
          break;

        case B_SET_PROPERTY:
          /* Match the argument against the mode names, ignoring case.  The
          index i is left at SM_MAX if there was no string argument or if no
          name matched. */
          i = SM_MAX;
          if (ArgumentGotString)
            for (i = 0; i < SM_MAX; i++)
            {
              if (strcasecmp (ArgumentString, g_ScoringModeNames [i]) == 0)
              {
                m_ScoringMode = (ScoringModes) i;
                m_SettingsHaveChanged = true;
                break;
              }
            }
          if (i >= SM_MAX) /* Didn't find a valid scoring mode word. */
          {
            ErrorCode = B_BAD_TYPE;
            sprintf (TempString, "You used the unrecognized \"%s\" as "
              "a scoring mode for the SET %s command.  Should be one of: ",
              ArgumentGotString ? ArgumentString : "not specified",
              PropInfoPntr->name);
            /* List the valid names, separated by commas. */
            for (i = 0; i < SM_MAX; i++)
            {
              strcat (TempString, g_ScoringModeNames [i]);
              if (i < SM_MAX - 1)
                strcat (TempString, ", ");
            }
            goto ErrorExit;
          }
          break;

        default: /* Unknown operation code, error message already set. */
          goto ErrorExit;
      }
      break;

    case PN_TOKENIZE_MODE:
      switch (PropInfoPntr->commands[0])
      {
        case B_GET_PROPERTY:
          ReplyMessage.AddString (g_ResultName,
            g_TokenizeModeNames[m_TokenizeMode]);
          break;

        case B_SET_PROPERTY:
          /* Same name matching scheme as for the scoring mode above. */
          i = TM_MAX;
          if (ArgumentGotString)
            for (i = 0; i < TM_MAX; i++)
            {
              if (strcasecmp (ArgumentString, g_TokenizeModeNames [i]) == 0)
              {
                m_TokenizeMode = (TokenizeModes) i;
                m_SettingsHaveChanged = true;
                break;
              }
            }
          if (i >= TM_MAX) /* Didn't find a valid tokenize mode word. */
          {
            ErrorCode = B_BAD_TYPE;
            sprintf (TempString, "You used the unrecognized \"%s\" as "
              "a tokenize mode for the SET %s command.  Should be one of: ",
              ArgumentGotString ? ArgumentString : "not specified",
              PropInfoPntr->name);
            for (i = 0; i < TM_MAX; i++)
            {
              strcat (TempString, g_TokenizeModeNames [i]);
              if (i < TM_MAX - 1)
                strcat (TempString, ", ");
            }
            goto ErrorExit;
          }
          break;

        default: /* Unknown operation code, error message already set. */
          goto ErrorExit;
      }
      break;

    default:
      /* ErrorCode is still the B_BAD_INDEX default in this case. */
      sprintf (TempString, "Bug!  Unrecognized property identification "
        "number %d (should be between 0 and %d).  Fix the entry in "
        "the g_ScriptingPropertyList array!",
        (int) PropInfoPntr->extra_data, PN_MAX - 1);
      goto ErrorExit;
  }

  /* Success. */

  ReplyMessage.AddInt32 ("error", B_OK);
  ErrorCode = MessagePntr->SendReply (&ReplyMessage,
    this /* Reply's reply handler */, 500000 /* send timeout */);
  if (ErrorCode != B_OK)
    cerr << "ProcessScriptingMessage failed to send a reply message, code " <<
    ErrorCode << " (" << strerror (ErrorCode) << ")" << " for " <<
    CommandText.String () << endl;
  SetCursor (B_CURSOR_SYSTEM_DEFAULT);
  return;

ErrorExit: /* Error message in TempString, return code in ErrorCode. */
  /* The failure is both sent back in the reply and shown locally.  ErrorCode
  is then reused for the result of sending the reply. */
  ReplyMessage.AddInt32 ("error", ErrorCode);
  ReplyMessage.AddString ("message", TempString);
  DisplayErrorMessage (TempString, ErrorCode);
  ErrorCode = MessagePntr->SendReply (&ReplyMessage,
    this /* Reply's reply handler */, 500000 /* send timeout */);
  if (ErrorCode != B_OK)
    cerr << "ProcessScriptingMessage failed to send an error message, code " <<
    ErrorCode << " (" << strerror (ErrorCode) << ")" << " for " <<
    CommandText.String () << endl;
  SetCursor (B_CURSOR_SYSTEM_DEFAULT);
}
4668
4669
4670/* Since quitting stops the program before the results of a script command are
4671received, we use a time delay to do the quit and make sure there are no pending
4672commands being processed by the auxiliary looper which is sending us commands.
4673Also, we have a countdown which can be interrupted by an incoming scripting
4674message in case one client tells us to quit while another one is still using us
4675(happens when you have two or more e-mail accounts).  But if the system is
4676shutting down, quit immediately! */
4677
4678void
4679ABSApp::Pulse ()
4680{
4681  if (g_QuitCountdown == 0)
4682  {
4683    if (g_CommanderLooperPntr == NULL ||
4684    !g_CommanderLooperPntr->IsBusy ())
4685      PostMessage (B_QUIT_REQUESTED);
4686  }
4687  else if (g_QuitCountdown > 0)
4688  {
4689    cerr << "SpamDBM quitting in " << g_QuitCountdown << ".\n";
4690    g_QuitCountdown--;
4691  }
4692}
4693
4694
4695/* A quit request message has come in.  If the quit countdown has reached zero,
4696allow the request, otherwise reject it (and start the countdown if it hasn't
4697been started). */
4698
4699bool
4700ABSApp::QuitRequested ()
4701{
4702  BMessage  *QuitMessage;
4703  team_info  RemoteInfo;
4704  BMessenger RemoteMessenger;
4705  team_id    RemoteTeam;
4706
4707  /* See if the quit is from the system shutdown command (which goes through
4708  the registrar server), if so, quit immediately. */
4709
4710  QuitMessage = CurrentMessage ();
4711  if (QuitMessage != NULL && QuitMessage->IsSourceRemote ())
4712  {
4713    RemoteMessenger = QuitMessage->ReturnAddress ();
4714    RemoteTeam = RemoteMessenger.Team ();
4715    if (get_team_info (RemoteTeam, &RemoteInfo) == B_OK &&
4716    strstr (RemoteInfo.args, "registrar") != NULL)
4717      g_QuitCountdown = 0;
4718  }
4719
4720  if (g_QuitCountdown == 0)
4721    return BApplication::QuitRequested ();
4722
4723  if (g_QuitCountdown < 0)
4724//    g_QuitCountdown = 10; /* Start the countdown. */
4725    g_QuitCountdown = 5; /* Quit more quickly */
4726
4727  return false;
4728}
4729
4730
4731/* Go through the current database and delete words which are too old (time is
4732equivalent to the number of messages added to the database) and too unpopular
4733(words not used by many messages).  Hopefully this will get rid of words which
4734are just hunks of binary or other garbage.  The database has been loaded
4735elsewhere. */
4736
4737status_t
4738ABSApp::PurgeOldWords (char *ErrorMessage)
4739{
4740  uint32                  CurrentTime;
4741  StatisticsMap::iterator CurrentIter;
4742  StatisticsMap::iterator EndIter;
4743  StatisticsMap::iterator NextIter;
4744  char                    TempString [80];
4745
4746  strcpy (ErrorMessage, "Purge can't fail"); /* So argument gets used. */
4747  CurrentTime = m_TotalGenuineMessages + m_TotalSpamMessages - 1;
4748  m_OldestAge = (uint32) -1 /* makes largest number possible */;
4749
4750  EndIter = m_WordMap.end ();
4751  NextIter = m_WordMap.begin ();
4752  while (NextIter != EndIter) {
4753    CurrentIter = NextIter++;
4754
4755    if (CurrentTime - CurrentIter->second.age >= m_PurgeAge &&
4756    CurrentIter->second.genuineCount + CurrentIter->second.spamCount <=
4757    m_PurgePopularity) {
4758      /* Delete this word, it is unpopular and old.  Sob. */
4759
4760      m_WordMap.erase (CurrentIter);
4761      if (m_WordCount > 0)
4762        m_WordCount--;
4763
4764      m_DatabaseHasChanged = true;
4765    }
4766    else /* This word is still in the database.  Update oldest age. */
4767    {
4768      if (CurrentIter->second.age < m_OldestAge)
4769        m_OldestAge = CurrentIter->second.age;
4770    }
4771  }
4772
4773  /* Just a little bug check here.  Just in case. */
4774
4775  if (m_WordCount != m_WordMap.size ()) {
4776    sprintf (TempString, "Our word count of %" B_PRIu32 " doesn't match the "
4777      "size of the database, %lu", m_WordCount, m_WordMap.size());
4778    DisplayErrorMessage (TempString, -1, "Bug!");
4779    m_WordCount = m_WordMap.size ();
4780  }
4781
4782  return B_OK;
4783}
4784
4785
4786void
4787ABSApp::ReadyToRun ()
4788{
4789  DatabaseWindow *DatabaseWindowPntr;
4790  float           JunkFloat;
4791  BButton        *TempButtonPntr;
4792  BCheckBox      *TempCheckBoxPntr;
4793  font_height     TempFontHeight;
4794  BMenuBar       *TempMenuBarPntr;
4795  BMenuItem      *TempMenuItemPntr;
4796  BPopUpMenu     *TempPopUpMenuPntr;
4797  BRadioButton   *TempRadioButtonPntr;
4798  BRect           TempRect;
4799  const char     *TempString = "Testing My Things";
4800  BStringView    *TempStringViewPntr;
4801  BTextControl   *TempTextPntr;
4802  BWindow        *TempWindowPntr;
4803
4804  /* This batch of code gets some measurements which will be used for laying
4805  out controls and other GUI elements.  Set the spacing between buttons and
4806  other controls to the width of the letter "M" in the user's desired font. */
4807
4808 g_MarginBetweenControls = (int) be_plain_font->StringWidth ("M");
4809
4810  /* Also find out how much space a line of text uses. */
4811
4812  be_plain_font->GetHeight (&TempFontHeight);
4813  g_LineOfTextHeight = ceilf (
4814    TempFontHeight.ascent + TempFontHeight.descent + TempFontHeight.leading);
4815
4816  /* Start finding out the height of various user interface gadgets, which can
4817  vary based on the current font size.  Make a temporary gadget, which is
4818  attached to our window, then resize it to its prefered size so that it
4819  accomodates the font size and other frills it needs. */
4820
4821  TempWindowPntr = new (std::nothrow) BWindow (BRect (10, 20, 200, 200),
4822	"Temporary Window", B_DOCUMENT_WINDOW,
4823	B_NO_WORKSPACE_ACTIVATION | B_ASYNCHRONOUS_CONTROLS);
4824  if (TempWindowPntr == NULL) {
4825    DisplayErrorMessage ("Unable to create temporary window for finding "
4826      "sizes of controls.");
4827    g_QuitCountdown = 0;
4828    return;
4829  }
4830
4831  TempRect = TempWindowPntr->Bounds ();
4832
4833  /* Find the height of a single line of text in a BStringView. */
4834
4835  TempStringViewPntr = new (std::nothrow) BStringView (TempRect, TempString, TempString);
4836  if (TempStringViewPntr != NULL) {
4837    TempWindowPntr->Lock();
4838    TempWindowPntr->AddChild (TempStringViewPntr);
4839    TempStringViewPntr->GetPreferredSize (&JunkFloat, &g_StringViewHeight);
4840    TempWindowPntr->RemoveChild (TempStringViewPntr);
4841    TempWindowPntr->Unlock();
4842    delete TempStringViewPntr;
4843  }
4844
4845  /* Find the height of a button, which seems to be larger than a text
4846  control and can make life difficult.  Make a temporary button, which
4847  is attached to our window so that it resizes to accomodate the font size. */
4848
4849  TempButtonPntr = new (std::nothrow) BButton (TempRect, TempString, TempString, NULL);
4850  if (TempButtonPntr != NULL) {
4851    TempWindowPntr->Lock();
4852    TempWindowPntr->AddChild (TempButtonPntr);
4853    TempButtonPntr->GetPreferredSize (&JunkFloat, &g_ButtonHeight);
4854    TempWindowPntr->RemoveChild (TempButtonPntr);
4855    TempWindowPntr->Unlock();
4856    delete TempButtonPntr;
4857  }
4858
4859  /* Find the height of a text box. */
4860
4861  TempTextPntr = new (std::nothrow) BTextControl (TempRect, TempString, NULL /* label */,
4862    TempString, NULL);
4863  if (TempTextPntr != NULL) {
4864    TempWindowPntr->Lock ();
4865    TempWindowPntr->AddChild (TempTextPntr);
4866    TempTextPntr->GetPreferredSize (&JunkFloat, &g_TextBoxHeight);
4867    TempWindowPntr->RemoveChild (TempTextPntr);
4868    TempWindowPntr->Unlock ();
4869    delete TempTextPntr;
4870  }
4871
4872  /* Find the height of a checkbox control. */
4873
4874  TempCheckBoxPntr = new (std::nothrow) BCheckBox (TempRect, TempString, TempString, NULL);
4875  if (TempCheckBoxPntr != NULL) {
4876    TempWindowPntr->Lock ();
4877    TempWindowPntr->AddChild (TempCheckBoxPntr);
4878    TempCheckBoxPntr->GetPreferredSize (&JunkFloat, &g_CheckBoxHeight);
4879    TempWindowPntr->RemoveChild (TempCheckBoxPntr);
4880    TempWindowPntr->Unlock ();
4881    delete TempCheckBoxPntr;
4882  }
4883
4884  /* Find the height of a radio button control. */
4885
4886  TempRadioButtonPntr =
4887    new (std::nothrow) BRadioButton (TempRect, TempString, TempString, NULL);
4888  if (TempRadioButtonPntr != NULL) {
4889    TempWindowPntr->Lock ();
4890    TempWindowPntr->AddChild (TempRadioButtonPntr);
4891    TempRadioButtonPntr->GetPreferredSize (&JunkFloat, &g_RadioButtonHeight);
4892    TempWindowPntr->RemoveChild (TempRadioButtonPntr);
4893    TempWindowPntr->Unlock ();
4894    delete TempRadioButtonPntr;
4895  }
4896
4897  /* Find the height of a pop-up menu. */
4898
4899  TempMenuBarPntr = new (std::nothrow) BMenuBar (TempRect, TempString,
4900    B_FOLLOW_LEFT | B_FOLLOW_TOP, B_ITEMS_IN_COLUMN,
4901    true /* resize to fit items */);
4902  TempPopUpMenuPntr = new (std::nothrow) BPopUpMenu (TempString);
4903  TempMenuItemPntr = new (std::nothrow) BMenuItem (TempString, new BMessage (12345), 'g');
4904
4905  if (TempMenuBarPntr != NULL && TempPopUpMenuPntr != NULL &&
4906  TempMenuItemPntr != NULL)
4907  {
4908    TempPopUpMenuPntr->AddItem (TempMenuItemPntr);
4909    TempMenuBarPntr->AddItem (TempPopUpMenuPntr);
4910
4911    TempWindowPntr->Lock ();
4912    TempWindowPntr->AddChild (TempMenuBarPntr);
4913    TempMenuBarPntr->GetPreferredSize (&JunkFloat, &g_PopUpMenuHeight);
4914    TempWindowPntr->RemoveChild (TempMenuBarPntr);
4915    TempWindowPntr->Unlock ();
4916    delete TempMenuBarPntr; // It will delete contents too.
4917  }
4918
4919  TempWindowPntr->Lock ();
4920  TempWindowPntr->Quit ();
4921
4922  SetPulseRate (500000);
4923
4924  if (g_CommandLineMode)
4925    g_QuitCountdown = 0; /* Quit as soon as queued up commands done. */
4926  else /* GUI mode, make a window. */
4927  {
4928    DatabaseWindowPntr = new (std::nothrow) DatabaseWindow ();
4929    if (DatabaseWindowPntr == NULL) {
4930      DisplayErrorMessage ("Unable to create window.");
4931      g_QuitCountdown = 0;
4932    } else {
4933      DatabaseWindowPntr->Show (); /* Starts the window's message loop. */
4934    }
4935  }
4936
4937  g_AppReadyToRunCompleted = true;
4938}
4939
4940
4941/* Given a mail component (body text, attachment, whatever), look for words in
4942it.  If the tokenize mode specifies that it isn't one of the ones we are
4943looking for, just skip it.  For container type components, recursively examine
4944their contents, up to the maximum depth specified. */
4945
4946status_t
4947ABSApp::RecursivelyTokenizeMailComponent (
4948  BMailComponent *ComponentPntr,
4949  const char *OptionalFileName,
4950  set<string> &WordSet,
4951  char *ErrorMessage,
4952  int RecursionLevel,
4953  int MaxRecursionLevel)
4954{
4955  char                        AttachmentName [B_FILE_NAME_LENGTH];
4956  BMailAttachment            *AttachmentPntr;
4957  BMimeType                   ComponentMIMEType;
4958  BMailContainer             *ContainerPntr;
4959  BMallocIO                   ContentsIO;
4960  const char                 *ContentsBufferPntr;
4961  size_t                      ContentsBufferSize;
4962  status_t                    ErrorCode;
4963  bool                        ExamineComponent;
4964  const char                 *HeaderKeyPntr;
4965  const char                 *HeaderValuePntr;
4966  int                         i;
4967  int                         j;
4968  const char                 *NameExtension;
4969  int                         NumComponents;
4970  BMimeType                   TextAnyMIMEType ("text");
4971  BMimeType                   TextPlainMIMEType ("text/plain");
4972
4973  if (ComponentPntr == NULL)
4974    return B_OK;
4975
4976  /* Add things in the sub-headers that might be useful.  Things like the file
4977  name of attachments, the encoding type, etc. */
4978
4979  if (m_TokenizeMode == TM_PLAIN_TEXT_HEADER ||
4980  m_TokenizeMode == TM_ANY_TEXT_HEADER ||
4981  m_TokenizeMode == TM_ALL_PARTS_HEADER ||
4982  m_TokenizeMode == TM_JUST_HEADER)
4983  {
4984    for (i = 0; i < 1000; i++)
4985    {
4986      HeaderKeyPntr = ComponentPntr->HeaderAt (i);
4987      if (HeaderKeyPntr == NULL)
4988        break;
4989      AddWordsToSet (HeaderKeyPntr, strlen (HeaderKeyPntr),
4990        'H' /* Prefix for Headers, uppercase unlike normal words. */, WordSet);
4991      for (j = 0; j < 1000; j++)
4992      {
4993        HeaderValuePntr = ComponentPntr->HeaderField (HeaderKeyPntr, j);
4994        if (HeaderValuePntr == NULL)
4995          break;
4996        AddWordsToSet (HeaderValuePntr, strlen (HeaderValuePntr),
4997          'H', WordSet);
4998      }
4999    }
5000  }
5001
5002  /* Check the MIME type of the thing.  It's used to decide if the contents are
5003  worth examining for words. */
5004
5005  ErrorCode = ComponentPntr->MIMEType (&ComponentMIMEType);
5006  if (ErrorCode != B_OK)
5007  {
5008    sprintf (ErrorMessage, "ABSApp::RecursivelyTokenizeMailComponent: "
5009      "Unable to get MIME type at level %d in \"%s\"",
5010      RecursionLevel, OptionalFileName);
5011    return ErrorCode;
5012  }
5013  if (ComponentMIMEType.Type() == NULL)
5014  {
5015    /* Have to make up a MIME type for things which don't have them, such as
5016    the main body text, otherwise it would get ignored. */
5017
5018    if (NULL != dynamic_cast<BTextMailComponent *>(ComponentPntr))
5019      ComponentMIMEType.SetType ("text/plain");
5020  }
5021  if (!TextAnyMIMEType.Contains (&ComponentMIMEType) &&
5022  NULL != (AttachmentPntr = dynamic_cast<BMailAttachment *>(ComponentPntr)))
5023  {
5024    /* Sometimes spam doesn't give a text MIME type for text when they do an
5025    attachment (which is often base64 encoded).  Use the file name extension to
5026    see if it really is text. */
5027    NameExtension = NULL;
5028    if (AttachmentPntr->FileName (AttachmentName) >= 0)
5029      NameExtension = strrchr (AttachmentName, '.');
5030    if (NameExtension != NULL)
5031    {
5032      if (strcasecmp (NameExtension, ".txt") == 0)
5033        ComponentMIMEType.SetType ("text/plain");
5034      else if (strcasecmp (NameExtension, ".htm") == 0 ||
5035      strcasecmp (NameExtension, ".html") == 0)
5036        ComponentMIMEType.SetType ("text/html");
5037    }
5038  }
5039
5040  switch (m_TokenizeMode)
5041  {
5042    case TM_PLAIN_TEXT:
5043    case TM_PLAIN_TEXT_HEADER:
5044      ExamineComponent = TextPlainMIMEType.Contains (&ComponentMIMEType);
5045      break;
5046
5047    case TM_ANY_TEXT:
5048    case TM_ANY_TEXT_HEADER:
5049      ExamineComponent = TextAnyMIMEType.Contains (&ComponentMIMEType);
5050      break;
5051
5052    case TM_ALL_PARTS:
5053    case TM_ALL_PARTS_HEADER:
5054      ExamineComponent = true;
5055      break;
5056
5057    default:
5058      ExamineComponent = false;
5059      break;
5060  }
5061
5062  if (ExamineComponent)
5063  {
5064    /* Get the contents of the component.  This will be UTF-8 text (converted
5065    from whatever encoding was used) for text attachments.  For other ones,
5066    it's just the raw data, or perhaps decoded from base64 encoding. */
5067
5068    ContentsIO.SetBlockSize (16 * 1024);
5069    ErrorCode = ComponentPntr->GetDecodedData (&ContentsIO);
5070    if (ErrorCode == B_OK) /* Can fail for container components: no data. */
5071    {
5072      /* Look for words in the decoded data. */
5073
5074      ContentsBufferPntr = (const char *) ContentsIO.Buffer ();
5075      ContentsBufferSize = ContentsIO.BufferLength ();
5076      if (ContentsBufferPntr != NULL /* can be empty */)
5077        AddWordsToSet (ContentsBufferPntr, ContentsBufferSize,
5078          0 /* no prefix character, this is body text */, WordSet);
5079    }
5080  }
5081
5082  /* Examine any sub-components in the message. */
5083
5084  if (RecursionLevel + 1 <= MaxRecursionLevel &&
5085  NULL != (ContainerPntr = dynamic_cast<BMailContainer *>(ComponentPntr)))
5086  {
5087    NumComponents = ContainerPntr->CountComponents ();
5088
5089    for (i = 0; i < NumComponents; i++)
5090    {
5091      ComponentPntr = ContainerPntr->GetComponent (i);
5092
5093      ErrorCode = RecursivelyTokenizeMailComponent (ComponentPntr,
5094        OptionalFileName, WordSet, ErrorMessage, RecursionLevel + 1,
5095        MaxRecursionLevel);
5096      if (ErrorCode != B_OK)
5097        break;
5098    }
5099  }
5100
5101  return ErrorCode;
5102}
5103
5104
5105/* The user has tried to open a file or several files with this application,
5106via Tracker's open-with menu item.  If it is a database type file, then change
5107the database file name to it.  Otherwise, ask the user whether they want to
5108classify it as spam or non-spam.  There will be at most around 100 files, BeOS
5109R5.0.3's Tracker crashes if it tries to pass on more than that many using Open
5110With... etc.  The command is sent to an intermediary thread where it is
5111asynchronously converted into a scripting message(s) that are sent back to this
5112BApplication.  The intermediary is needed since we can't recursively execute
5113scripting messages while processing a message (this RefsReceived one). */
5114
5115void
5116ABSApp::RefsReceived (BMessage *MessagePntr)
5117{
5118  if (g_CommanderLooperPntr != NULL)
5119    g_CommanderLooperPntr->CommandReferences (MessagePntr);
5120}
5121
5122
5123/* A scripting command is looking for something to execute it.  See if it is
5124targetted at our database. */
5125
5126BHandler * ABSApp::ResolveSpecifier (
5127  BMessage *MessagePntr,
5128  int32 Index,
5129  BMessage *SpecifierMsgPntr,
5130  int32 SpecificationKind,
5131  const char *PropertyPntr)
5132{
5133  int i;
5134
5135  /* See if it is one of our commands. */
5136
5137  if (SpecificationKind == B_DIRECT_SPECIFIER)
5138  {
5139    for (i = PN_MAX - 1; i >= 0; i--)
5140    {
5141      if (strcasecmp (PropertyPntr, g_PropertyNames [i]) == 0)
5142        return this; /* Found it!  Return the Handler (which is us). */
5143    }
5144  }
5145
5146  /* Handle an unrecognized scripting command, let the parent figure it out. */
5147
5148  return BApplication::ResolveSpecifier (
5149    MessagePntr, Index, SpecifierMsgPntr, SpecificationKind, PropertyPntr);
5150}
5151
5152
5153/* Save the database if it hasn't been saved yet.  Otherwise do nothing. */
5154
5155status_t ABSApp::SaveDatabaseIfNeeded (char *ErrorMessage)
5156{
5157  if (m_DatabaseHasChanged)
5158    return LoadSaveDatabase (false /* DoLoad */, ErrorMessage);
5159
5160  return B_OK;
5161}
5162
5163
5164/* Presumably the file is an e-mail message (or at least the header portion of
5165one).  Break it into parts: header, body and MIME components.  Then add the
5166words in the portions that match the current tokenization settings to the set
5167of words. */
5168
5169status_t ABSApp::TokenizeParts (
5170  BPositionIO *PositionIOPntr,
5171  const char *OptionalFileName,
5172  set<string> &WordSet,
5173  char *ErrorMessage)
5174{
5175  status_t        ErrorCode = B_OK;
5176  BEmailMessage   WholeEMail;
5177
5178  sprintf (ErrorMessage, "ABSApp::TokenizeParts: While getting e-mail "
5179    "headers, had problems with \"%s\"", OptionalFileName);
5180
5181  ErrorCode = WholeEMail.SetToRFC822 (
5182    PositionIOPntr /* it does its own seeking to the start */,
5183    -1 /* length */, true /* parse_now */);
5184  if (ErrorCode < 0) goto ErrorExit;
5185
5186  ErrorCode = RecursivelyTokenizeMailComponent (&WholeEMail,
5187    OptionalFileName, WordSet, ErrorMessage, 0 /* Initial recursion level */,
5188    (m_TokenizeMode == TM_JUST_HEADER) ? 0 : 500 /* Max recursion level */);
5189
5190ErrorExit:
5191  return ErrorCode;
5192}
5193
5194
5195/* Add all the words in the whole file or memory buffer to the supplied set.
5196The file doesn't have to be an e-mail message since it isn't parsed for e-mail
5197headers or MIME headers or anything.  It blindly adds everything that looks
5198like a word, though it does convert quoted printable codes to the characters
5199they represent.  See also AddWordsToSet which does something more advanced. */
5200
5201status_t ABSApp::TokenizeWhole (
5202  BPositionIO *PositionIOPntr,
5203  const char *OptionalFileName,
5204  set<string> &WordSet,
5205  char *ErrorMessage)
5206{
5207  string                AccumulatedWord;
5208  uint8                 Buffer [16 * 1024];
5209  uint8                *BufferCurrentPntr = Buffer + 0;
5210  uint8                *BufferEndPntr = Buffer + 0;
5211  const char           *IOErrorString =
5212                          "TokenizeWhole: Error %ld while reading \"%s\"";
5213  size_t                Length;
5214  int                   Letter = ' ';
5215  char                  HexString [4];
5216  int                   NextLetter = ' ';
5217  int                   NextNextLetter = ' ';
5218
5219  /* Use a buffer since reading single characters from a BFile is so slow.
5220  BufferCurrentPntr is the position of the next character to be read.  When it
5221  reaches BufferEndPntr, it is time to fill the buffer again. */
5222
5223#define ReadChar(CharVar) \
5224  { \
5225    if (BufferCurrentPntr < BufferEndPntr) \
5226      CharVar = *BufferCurrentPntr++; \
5227    else /* Try to fill the buffer. */ \
5228    { \
5229      ssize_t AmountRead; \
5230      AmountRead = PositionIOPntr->Read (Buffer, sizeof (Buffer)); \
5231      if (AmountRead < 0) \
5232      { \
5233        sprintf (ErrorMessage, IOErrorString, AmountRead, OptionalFileName); \
5234        return AmountRead; \
5235      } \
5236      else if (AmountRead == 0) \
5237        CharVar = EOF; \
5238      else \
5239      { \
5240        BufferEndPntr = Buffer + AmountRead; \
5241        BufferCurrentPntr = Buffer + 0; \
5242        CharVar = *BufferCurrentPntr++; \
5243      } \
5244    } \
5245  }
5246
5247  /* Read all the words in the file and add them to our local set of words.  A
5248  set is used since we don't care how many times a word occurs. */
5249
5250  while (true)
5251  {
5252    /* We read two letters ahead so that we can decode quoted printable
5253    characters (an equals sign followed by two hex digits or a new line).  Note
5254    that Letter can become EOF (-1) when end of file is reached. */
5255
5256    Letter = NextLetter;
5257    NextLetter = NextNextLetter;
5258    ReadChar (NextNextLetter);
5259
5260    /* Decode quoted printable codes first, so that the rest of the code just
5261    sees an ordinary character.  Or even nothing, if it is the hidden line
5262    break combination.  This may falsely corrupt stuff following an equals
5263    sign, but usually won't. */
5264
5265    if (Letter == '=')
5266    {
5267      if ((NextLetter == '\r' && NextNextLetter == '\n') ||
5268      (NextLetter == '\n' && NextNextLetter == '\r'))
5269      {
5270        /* Make the "=\r\n" pair disappear.  It's not even white space. */
5271        ReadChar (NextLetter);
5272        ReadChar (NextNextLetter);
5273        continue;
5274      }
5275      if (NextLetter == '\n' || NextLetter == '\r')
5276      {
5277        /* Make the "=\n" pair disappear.  It's not even white space. */
5278        NextLetter = NextNextLetter;
5279        ReadChar (NextNextLetter);
5280        continue;
5281      }
5282      if (NextNextLetter != EOF &&
5283      isxdigit (NextLetter) && isxdigit (NextNextLetter))
5284      {
5285        /* Convert the hex code to a letter. */
5286        HexString[0] = NextLetter;
5287        HexString[1] = NextNextLetter;
5288        HexString[2] = 0;
5289        Letter = strtoul (HexString, NULL, 16 /* number system base */);
5290        ReadChar (NextLetter);
5291        ReadChar (NextNextLetter);
5292      }
5293    }
5294
5295    /* Convert to lower case to improve word matches.  Of course this loses a
5296    bit of information, such as MONEY vs Money, an indicator of spam.  Well,
5297    apparently that isn't all that useful a distinction, so do it. */
5298
5299    if (Letter >= 'A' && Letter < 'Z')
5300      Letter = Letter + ('a' - 'A');
5301
5302    /* See if it is a letter we treat as white space - all control characters
5303    and all punctuation except for: apostrophe (so "it's" and possessive
5304    versions of words get stored), dash (for hyphenated words), dollar sign
5305    (for cash amounts), period (for IP addresses, we later remove trailing
5306    (periods).  Note that codes above 127 are UTF-8 characters, which we
5307    consider non-space. */
5308
5309    if (Letter < 0 /* EOF */ || (Letter < 128 && g_SpaceCharacters[Letter]))
5310    {
5311      /* That space finished off a word.  Remove trailing periods... */
5312
5313      while ((Length = AccumulatedWord.size()) > 0 &&
5314      AccumulatedWord [Length-1] == '.')
5315        AccumulatedWord.resize (Length - 1);
5316
5317      /* If there's anything left in the word, add it to the set.  Also ignore
5318      words which are too big (it's probably some binary encoded data).  But
5319      leave room for supercalifragilisticexpialidoceous.  According to one web
5320      site, pneumonoultramicroscopicsilicovolcanoconiosis is the longest word
5321      currently in English.  Note that some uuencoded data was seen with a 60
5322      character line length. */
5323
5324      if (Length > 0 && Length <= g_MaxWordLength)
5325        WordSet.insert (AccumulatedWord);
5326
5327      /* Empty out the string to get ready for the next word. */
5328
5329      AccumulatedWord.resize (0);
5330    }
5331    else /* Not a space-like character, add it to the word. */
5332      AccumulatedWord.append (1 /* one copy of the char */, (char) Letter);
5333
5334    /* Stop at end of file or error.  Don't care which.  Exit here so that last
5335    word got processed. */
5336
5337    if (Letter == EOF)
5338      break;
5339  }
5340
5341  return B_OK;
5342}
5343
5344
5345
/******************************************************************************
 * Implementation of the ClassificationChoicesWindow class, constructor,
 * destructor and the rest of the member functions in mostly alphabetical
 * order.
 */
5351
5352ClassificationChoicesWindow::ClassificationChoicesWindow (
5353  BRect FrameRect,
5354  const char *FileName,
5355  int NumberOfFiles)
5356: BWindow (FrameRect, "Classification Choices", B_TITLED_WINDOW,
5357    B_NOT_ZOOMABLE | B_NOT_RESIZABLE | B_ASYNCHRONOUS_CONTROLS),
5358  m_BulkModeSelectedPntr (NULL),
5359  m_ChoosenClassificationPntr (NULL)
5360{
5361  ClassificationChoicesView *SubViewPntr;
5362
5363  SubViewPntr = new ClassificationChoicesView (Bounds(),
5364    FileName, NumberOfFiles);
5365  AddChild (SubViewPntr);
5366  SubViewPntr->ResizeToPreferred ();
5367  ResizeTo (SubViewPntr->Frame().Width(), SubViewPntr->Frame().Height());
5368}
5369
5370
5371void
5372ClassificationChoicesWindow::MessageReceived (BMessage *MessagePntr)
5373{
5374  BControl *ControlPntr;
5375
5376  if (MessagePntr->what >= MSG_CLASS_BUTTONS &&
5377  MessagePntr->what < MSG_CLASS_BUTTONS + CL_MAX)
5378  {
5379    if (m_ChoosenClassificationPntr != NULL)
5380      *m_ChoosenClassificationPntr =
5381        (ClassificationTypes) (MessagePntr->what - MSG_CLASS_BUTTONS);
5382    PostMessage (B_QUIT_REQUESTED); // Close and destroy the window.
5383    return;
5384  }
5385
5386  if (MessagePntr->what == MSG_BULK_CHECKBOX)
5387  {
5388    if (m_BulkModeSelectedPntr != NULL &&
5389    MessagePntr->FindPointer ("source", (void **) &ControlPntr) == B_OK)
5390      *m_BulkModeSelectedPntr = (ControlPntr->Value() == B_CONTROL_ON);
5391    return;
5392  }
5393
5394  if (MessagePntr->what == MSG_CANCEL_BUTTON)
5395  {
5396    PostMessage (B_QUIT_REQUESTED); // Close and destroy the window.
5397    return;
5398  }
5399
5400  BWindow::MessageReceived (MessagePntr);
5401}
5402
5403
5404void
5405ClassificationChoicesWindow::Go (
5406  bool *BulkModeSelectedPntr,
5407  ClassificationTypes *ChoosenClassificationPntr)
5408{
5409  status_t  ErrorCode = 0;
5410  BView    *MainViewPntr;
5411  thread_id WindowThreadID;
5412
5413  m_BulkModeSelectedPntr = BulkModeSelectedPntr;
5414  m_ChoosenClassificationPntr = ChoosenClassificationPntr;
5415  if (m_ChoosenClassificationPntr != NULL)
5416    *m_ChoosenClassificationPntr = CL_MAX;
5417
5418  Show (); // Starts the window thread running.
5419
5420  /* Move the window to the center of the screen it is now being displayed on
5421  (have to wait for it to be showing). */
5422
5423  Lock ();
5424  MainViewPntr = FindView ("ClassificationChoicesView");
5425  if (MainViewPntr != NULL)
5426  {
5427    BRect   TempRect;
5428    BScreen TempScreen (this);
5429    float   X;
5430    float   Y;
5431
5432    TempRect = TempScreen.Frame ();
5433    X = TempRect.Width() / 2;
5434    Y = TempRect.Height() / 2;
5435    TempRect = MainViewPntr->Frame();
5436    X -= TempRect.Width() / 2;
5437    Y -= TempRect.Height() / 2;
5438    MoveTo (ceilf (X), ceilf (Y));
5439  }
5440  Unlock ();
5441
5442  /* Wait for the window to go away. */
5443
5444  WindowThreadID = Thread ();
5445  if (WindowThreadID >= 0)
5446    // Delay until the window thread has died, presumably window deleted now.
5447    wait_for_thread (WindowThreadID, &ErrorCode);
5448}
5449
5450
5451
5452/******************************************************************************
5453 * Implementation of the ClassificationChoicesView class, constructor,
5454 * destructor and the rest of the member functions in mostly alphabetical
5455 * order.
5456 */
5457
5458ClassificationChoicesView::ClassificationChoicesView (
5459  BRect FrameRect,
5460  const char *FileName,
5461  int NumberOfFiles)
5462: BView (FrameRect, "ClassificationChoicesView",
5463    B_FOLLOW_TOP | B_FOLLOW_LEFT, B_WILL_DRAW | B_NAVIGABLE_JUMP),
5464  m_FileName (FileName),
5465  m_NumberOfFiles (NumberOfFiles),
5466  m_PreferredBottomY (ceilf (g_ButtonHeight * 10))
5467{
5468}
5469
5470
5471void
5472ClassificationChoicesView::AttachedToWindow ()
5473{
5474  BButton            *ButtonPntr;
5475  BCheckBox          *CheckBoxPntr;
5476  ClassificationTypes Classification;
5477  float               Margin;
5478  float               RowHeight;
5479  float               RowTop;
5480  BTextView          *TextViewPntr;
5481  BRect               TempRect;
5482  char                TempString [2048];
5483  BRect               TextRect;
5484  float               X;
5485
5486  SetViewColor (ui_color (B_PANEL_BACKGROUND_COLOR));
5487
5488  RowHeight = g_ButtonHeight;
5489  if (g_CheckBoxHeight > RowHeight)
5490    RowHeight = g_CheckBoxHeight;
5491  RowHeight = ceilf (RowHeight * 1.1);
5492
5493  TempRect = Bounds ();
5494  RowTop = TempRect.top;
5495
5496  /* Show the file name text. */
5497
5498  Margin = ceilf ((RowHeight - g_StringViewHeight) / 2);
5499  TempRect = Bounds ();
5500  TempRect.top = RowTop + Margin;
5501  TextRect = TempRect;
5502  TextRect.OffsetTo (0, 0);
5503  TextRect.InsetBy (g_MarginBetweenControls, 2);
5504  sprintf (TempString, "How do you want to classify the file named \"%s\"?",
5505    m_FileName);
5506  TextViewPntr = new BTextView (TempRect, "FileText", TextRect,
5507    B_FOLLOW_TOP | B_FOLLOW_LEFT, B_WILL_DRAW | B_FULL_UPDATE_ON_RESIZE);
5508  AddChild (TextViewPntr);
5509  TextViewPntr->SetText (TempString);
5510  TextViewPntr->MakeEditable (false);
5511  TextViewPntr->SetViewColor (ui_color (B_PANEL_BACKGROUND_COLOR));
5512  TextViewPntr->ResizeTo (TempRect.Width (),
5513    3 + TextViewPntr->TextHeight (0, sizeof (TempString)));
5514  RowTop = TextViewPntr->Frame().bottom + Margin;
5515
5516  /* Make the classification buttons. */
5517
5518  Margin = ceilf ((RowHeight - g_ButtonHeight) / 2);
5519  TempRect = Bounds ();
5520  TempRect.top = RowTop + Margin;
5521  X = Bounds().left + g_MarginBetweenControls;
5522  for (Classification = (ClassificationTypes) 0; Classification < CL_MAX;
5523  Classification = (ClassificationTypes) ((int) Classification + 1))
5524  {
5525    TempRect = Bounds ();
5526    TempRect.top = RowTop + Margin;
5527    TempRect.left = X;
5528    sprintf (TempString, "%s Button",
5529      g_ClassificationTypeNames [Classification]);
5530    ButtonPntr = new BButton (TempRect, TempString,
5531      g_ClassificationTypeNames [Classification], new BMessage (
5532      ClassificationChoicesWindow::MSG_CLASS_BUTTONS + Classification));
5533    AddChild (ButtonPntr);
5534    ButtonPntr->ResizeToPreferred ();
5535    X = ButtonPntr->Frame().right + 3 * g_MarginBetweenControls;
5536  }
5537  RowTop += ceilf (RowHeight * 1.2);
5538
5539  /* Make the Cancel button. */
5540
5541  Margin = ceilf ((RowHeight - g_ButtonHeight) / 2);
5542  TempRect = Bounds ();
5543  TempRect.top = RowTop + Margin;
5544  TempRect.left += g_MarginBetweenControls;
5545  ButtonPntr = new BButton (TempRect, "Cancel Button",
5546    "Cancel", new BMessage (ClassificationChoicesWindow::MSG_CANCEL_BUTTON));
5547  AddChild (ButtonPntr);
5548  ButtonPntr->ResizeToPreferred ();
5549  X = ButtonPntr->Frame().right + g_MarginBetweenControls;
5550
5551  /* Make the checkbox for bulk operations. */
5552
5553  if (m_NumberOfFiles > 1)
5554  {
5555    Margin = ceilf ((RowHeight - g_CheckBoxHeight) / 2);
5556    TempRect = Bounds ();
5557    TempRect.top = RowTop + Margin;
5558    TempRect.left = X;
5559    sprintf (TempString, "Mark all %d remaining messages the same way.",
5560      m_NumberOfFiles - 1);
5561    CheckBoxPntr = new BCheckBox (TempRect, "BulkBox", TempString,
5562      new BMessage (ClassificationChoicesWindow::MSG_BULK_CHECKBOX));
5563    AddChild (CheckBoxPntr);
5564    CheckBoxPntr->ResizeToPreferred ();
5565  }
5566  RowTop += RowHeight;
5567
5568  m_PreferredBottomY = RowTop;
5569}
5570
5571
5572void
5573ClassificationChoicesView::GetPreferredSize (float *width, float *height)
5574{
5575  if (width != NULL)
5576    *width = Bounds().Width();
5577  if (height != NULL)
5578    *height = m_PreferredBottomY;
5579}
5580
5581
5582
5583/******************************************************************************
5584 * Implementation of the CommanderLooper class, constructor, destructor and the
5585 * rest of the member functions in mostly alphabetical order.
5586 */
5587
5588CommanderLooper::CommanderLooper ()
5589: BLooper ("CommanderLooper", B_NORMAL_PRIORITY),
5590  m_IsBusy (false)
5591{
5592}
5593
5594
5595CommanderLooper::~CommanderLooper ()
5596{
5597  g_CommanderLooperPntr = NULL;
5598  delete g_CommanderMessenger;
5599  g_CommanderMessenger = NULL;
5600}
5601
5602
5603/* Process some command line arguments.  Basically just send a message to this
5604looper itself to do the work later.  That way the caller can continue doing
5605whatever they're doing, particularly if it's the BApplication. */
5606
5607void
5608CommanderLooper::CommandArguments (int argc, char **argv)
5609{
5610  int      i;
5611  BMessage InternalMessage;
5612
5613  InternalMessage.what = MSG_COMMAND_ARGUMENTS;
5614  for (i = 0; i < argc; i++)
5615    InternalMessage.AddString ("arg", argv[i]);
5616
5617  PostMessage (&InternalMessage);
5618}
5619
5620
5621/* Copy the refs out of the given message and stuff them into an internal
5622message to ourself (so that the original message can be returned to the caller,
5623and if it is Tracker, it can close the file handles it has open).  Optionally
5624allow preset classification rather than asking the user (set BulkMode to TRUE
5625and specify the class with BulkClassification). */
5626
5627void
5628CommanderLooper::CommandReferences (
5629  BMessage *MessagePntr,
5630  bool BulkMode,
5631  ClassificationTypes BulkClassification)
5632{
5633  entry_ref EntryRef;
5634  int       i;
5635  BMessage  InternalMessage;
5636
5637  InternalMessage.what = MSG_COMMAND_FILE_REFS;
5638  for (i = 0; MessagePntr->FindRef ("refs", i, &EntryRef) == B_OK; i++)
5639    InternalMessage.AddRef ("refs", &EntryRef);
5640  InternalMessage.AddBool ("BulkMode", BulkMode);
5641  InternalMessage.AddInt32 ("BulkClassification", BulkClassification);
5642
5643  PostMessage (&InternalMessage);
5644}
5645
5646
5647/* This function is called by other threads to see if the CommanderLooper is
5648busy working on something. */
5649
5650bool
5651CommanderLooper::IsBusy ()
5652{
5653  if (m_IsBusy)
5654    return true;
5655
5656  if (IsLocked () || !MessageQueue()->IsEmpty ())
5657    return true;
5658
5659  return false;
5660}
5661
5662
5663void
5664
5665CommanderLooper::MessageReceived (BMessage *MessagePntr)
5666{
5667  m_IsBusy = true;
5668
5669  if (MessagePntr->what == MSG_COMMAND_ARGUMENTS)
5670    ProcessArgs (MessagePntr);
5671  else if (MessagePntr->what == MSG_COMMAND_FILE_REFS)
5672    ProcessRefs (MessagePntr);
5673  else
5674    BLooper::MessageReceived (MessagePntr);
5675
5676  m_IsBusy = false;
5677}
5678
5679
5680/* Process the command line by converting it into a series of scripting
5681messages (possibly thousands) and sent them to the BApplication synchronously
5682(so we can print the result). */
5683
5684void
5685CommanderLooper::ProcessArgs (BMessage *MessagePntr)
5686{
5687  int32                 argc = 0;
5688  const char          **argv = NULL;
5689  int                   ArgumentIndex;
5690  uint32                CommandCode;
5691  const char           *CommandWord;
5692  status_t              ErrorCode;
5693  const char           *ErrorTitle = "ProcessArgs";
5694  char                 *EndPntr;
5695  int32                 i;
5696  BMessage              ReplyMessage;
5697  BMessage              ScriptMessage;
5698  struct property_info *PropInfoPntr;
5699  const char           *PropertyName;
5700  bool                  TempBool;
5701  float                 TempFloat;
5702  int32                 TempInt32;
5703  const char           *TempStringPntr;
5704  type_code             TypeCode;
5705  const char           *ValuePntr;
5706
5707  /* Get the argument count and pointers to arguments out of the message and
5708  into our argc and argv. */
5709
5710  ErrorCode = MessagePntr->GetInfo ("arg", &TypeCode, &argc);
5711  if (ErrorCode != B_OK || TypeCode != B_STRING_TYPE)
5712  {
5713    DisplayErrorMessage ("Unable to find argument strings in message",
5714      ErrorCode, ErrorTitle);
5715    goto ErrorExit;
5716  }
5717
5718  if (argc < 2)
5719  {
5720    cerr << PrintUsage;
5721    DisplayErrorMessage ("You need to specify a command word, like GET, SET "
5722      "and so on followed by a property, like DatabaseFile, and maybe "
5723      "followed by a value of some sort", -1, ErrorTitle);
5724    goto ErrorExit;
5725  }
5726
5727  argv = (const char **) malloc (sizeof (char *) * argc);
5728  if (argv == NULL)
5729  {
5730    DisplayErrorMessage ("Out of memory when allocating argv array",
5731      ENOMEM, ErrorTitle);
5732    goto ErrorExit;
5733  }
5734
5735  for (i = 0; i < argc; i++)
5736  {
5737    if ((ErrorCode = MessagePntr->FindString ("arg", i, &argv[i])) != B_OK)
5738    {
5739      DisplayErrorMessage ("Unable to find argument in the BMessage",
5740        ErrorCode, ErrorTitle);
5741      goto ErrorExit;
5742    }
5743  }
5744
5745  CommandWord = argv[1];
5746
5747  /* Special case for the Quit command since it isn't a scripting command. */
5748
5749  if (strcasecmp (CommandWord, "quit") == 0)
5750  {
5751    g_QuitCountdown = 10;
5752    goto ErrorExit;
5753  }
5754
5755  /* Find the corresponding scripting command. */
5756
5757  if (strcasecmp (CommandWord, "set") == 0)
5758    CommandCode = B_SET_PROPERTY;
5759  else if (strcasecmp (CommandWord, "get") == 0)
5760    CommandCode = B_GET_PROPERTY;
5761  else if (strcasecmp (CommandWord, "count") == 0)
5762    CommandCode = B_COUNT_PROPERTIES;
5763  else if (strcasecmp (CommandWord, "create") == 0)
5764    CommandCode = B_CREATE_PROPERTY;
5765  else if (strcasecmp (CommandWord, "delete") == 0)
5766    CommandCode = B_DELETE_PROPERTY;
5767  else
5768    CommandCode = B_EXECUTE_PROPERTY;
5769
5770  if (CommandCode == B_EXECUTE_PROPERTY)
5771  {
5772    PropertyName = CommandWord;
5773    ArgumentIndex = 2; /* Arguments to the command start at this index. */
5774  }
5775  else
5776  {
5777    if (CommandCode == B_SET_PROPERTY)
5778    {
5779      /* SET commands require at least one argument value. */
5780      if (argc < 4)
5781      {
5782        cerr << PrintUsage;
5783        DisplayErrorMessage ("SET commands require at least one "
5784          "argument value after the property name", -1, ErrorTitle);
5785        goto ErrorExit;
5786      }
5787    }
5788    else
5789      if (argc < 3)
5790      {
5791        cerr << PrintUsage;
5792        DisplayErrorMessage ("You need to specify a property to act on",
5793          -1, ErrorTitle);
5794        goto ErrorExit;
5795      }
5796    PropertyName = argv[2];
5797    ArgumentIndex = 3;
5798  }
5799
5800  /* See if it is one of our commands. */
5801
5802  for (PropInfoPntr = g_ScriptingPropertyList + 0; true; PropInfoPntr++)
5803  {
5804    if (PropInfoPntr->name == 0)
5805    {
5806      cerr << PrintUsage;
5807      DisplayErrorMessage ("The property specified isn't known or "
5808        "doesn't support the requested action (usually means it is an "
5809        "unknown command)", -1, ErrorTitle);
5810      goto ErrorExit; /* Unrecognized command. */
5811    }
5812
5813    if (PropInfoPntr->commands[0] == CommandCode &&
5814    strcasecmp (PropertyName, PropInfoPntr->name) == 0)
5815      break;
5816  }
5817
5818  /* Make the equivalent command message.  For commands with multiple
5819  arguments, repeat the message for each single argument and just change the
5820  data portion for each extra argument.  Send the command and wait for a reply,
5821  which we'll print out. */
5822
5823  ScriptMessage.MakeEmpty ();
5824  ScriptMessage.what = CommandCode;
5825  ScriptMessage.AddSpecifier (PropertyName);
5826  while (true)
5827  {
5828    if (ArgumentIndex < argc) /* If there are arguments to be added. */
5829    {
5830      ValuePntr = argv[ArgumentIndex];
5831
5832      /* Convert the value into the likely kind of data. */
5833
5834      if (strcasecmp (ValuePntr, "yes") == 0 ||
5835      strcasecmp (ValuePntr, "true") == 0)
5836        ScriptMessage.AddBool (g_DataName, true);
5837      else if (strcasecmp (ValuePntr, "no") == 0 ||
5838      strcasecmp (ValuePntr, "false") == 0)
5839        ScriptMessage.AddBool (g_DataName, false);
5840      else
5841      {
5842        /* See if it is a number. */
5843        i = strtol (ValuePntr, &EndPntr, 0);
5844        if (*EndPntr == 0)
5845          ScriptMessage.AddInt32 (g_DataName, i);
5846        else /* Nope, it's just a string. */
5847          ScriptMessage.AddString (g_DataName, ValuePntr);
5848      }
5849    }
5850
5851    ErrorCode = be_app_messenger.SendMessage (&ScriptMessage, &ReplyMessage);
5852    if (ErrorCode != B_OK)
5853    {
5854      DisplayErrorMessage ("Unable to send scripting command",
5855        ErrorCode, ErrorTitle);
5856      goto ErrorExit;
5857    }
5858
5859    /* Print the reply to the scripting command.  Even in server mode.  To
5860    standard output. */
5861
5862    if (ReplyMessage.FindString ("CommandText", &TempStringPntr) == B_OK)
5863    {
5864      TempInt32 = -1;
5865      if (ReplyMessage.FindInt32 ("error", &TempInt32) == B_OK &&
5866      TempInt32 == B_OK)
5867      {
5868        /* It's a successful reply to one of our scripting messages.  Print out
5869        the returned values code for command line users to see. */
5870
5871        cout << "Result of command to " << TempStringPntr << " is:\t";
5872        if (ReplyMessage.FindString (g_ResultName, &TempStringPntr) == B_OK)
5873          cout << "\"" << TempStringPntr << "\"";
5874        else if (ReplyMessage.FindInt32 (g_ResultName, &TempInt32) == B_OK)
5875          cout << TempInt32;
5876        else if (ReplyMessage.FindFloat (g_ResultName, &TempFloat) == B_OK)
5877          cout << TempFloat;
5878        else if (ReplyMessage.FindBool (g_ResultName, &TempBool) == B_OK)
5879          cout << (TempBool ? "true" : "false");
5880        else
5881          cout << "just plain success";
5882        if (ReplyMessage.FindInt32 ("count", &TempInt32) == B_OK)
5883          cout << "\t(count " << TempInt32 << ")";
5884        for (i = 0; (i < 50) &&
5885        ReplyMessage.FindString ("words", i, &TempStringPntr) == B_OK &&
5886        ReplyMessage.FindFloat ("ratios", i, &TempFloat) == B_OK;
5887        i++)
5888        {
5889          if (i == 0)
5890            cout << "\twith top words:\t";
5891          else
5892            cout << "\t";
5893          cout << TempStringPntr << "/" << TempFloat;
5894        }
5895        cout << endl;
5896      }
5897      else /* An error reply, print out the error, even in server mode. */
5898      {
5899        cout << "Failure of command " << TempStringPntr << ", error ";
5900        cout << TempInt32 << " (" << strerror (TempInt32) << ")";
5901        if (ReplyMessage.FindString ("message", &TempStringPntr) == B_OK)
5902          cout << ", message: " << TempStringPntr;
5903        cout << "." << endl;
5904      }
5905    }
5906
5907    /* Advance to the next argument and its scripting message. */
5908
5909    ScriptMessage.RemoveName (g_DataName);
5910    if (++ArgumentIndex >= argc)
5911      break;
5912  }
5913
5914ErrorExit:
5915  free (argv);
5916}
5917
5918
5919/* Given a bunch of references to files, open the files.  If it's a database
5920file, switch to using it as a database.  Otherwise, treat them as text files
5921and add them to the database.  Prompt the user for the spam or genuine or
5922uncertain (declassification) choice, with the option to bulk mark many files at
5923once. */
5924
5925void
5926CommanderLooper::ProcessRefs (BMessage *MessagePntr)
5927{
5928  bool                         BulkMode = false;
5929  ClassificationTypes          BulkClassification = CL_GENUINE;
5930  ClassificationChoicesWindow *ChoiceWindowPntr;
5931  BEntry                       Entry;
5932  entry_ref                    EntryRef;
5933  status_t                     ErrorCode;
5934  const char                  *ErrorTitle = "CommanderLooper::ProcessRefs";
5935  int32                        NumberOfRefs = 0;
5936  BPath                        Path;
5937  int                          RefIndex;
5938  BMessage                     ReplyMessage;
5939  BMessage                     ScriptingMessage;
5940  bool                         TempBool;
5941  BFile                        TempFile;
5942  int32                        TempInt32;
5943  char                         TempString [PATH_MAX + 1024];
5944  type_code                    TypeCode;
5945
5946  // Wait for ReadyToRun to finish initializing the globals with the sizes of
5947  // the controls, since they are needed when we show the custom alert box for
5948  // choosing the message type.
5949
5950  TempInt32 = 0;
5951  while (!g_AppReadyToRunCompleted && TempInt32++ < 10)
5952    snooze (200000);
5953
5954  ErrorCode = MessagePntr->GetInfo ("refs", &TypeCode, &NumberOfRefs);
5955  if (ErrorCode != B_OK || TypeCode != B_REF_TYPE || NumberOfRefs <= 0)
5956  {
5957    DisplayErrorMessage ("Unable to get refs from the message",
5958      ErrorCode, ErrorTitle);
5959    return;
5960  }
5961
5962  if (MessagePntr->FindBool ("BulkMode", &TempBool) == B_OK)
5963    BulkMode = TempBool;
5964  if (MessagePntr->FindInt32 ("BulkClassification", &TempInt32) == B_OK &&
5965  TempInt32 >= 0 && TempInt32 < CL_MAX)
5966    BulkClassification = (ClassificationTypes) TempInt32;
5967
5968  for (RefIndex = 0;
5969  MessagePntr->FindRef ("refs", RefIndex, &EntryRef) == B_OK;
5970  RefIndex++)
5971  {
5972    ScriptingMessage.MakeEmpty ();
5973    ScriptingMessage.what = 0; /* Haven't figured out what to do yet. */
5974
5975    /* See if the entry is a valid file or directory or other thing. */
5976
5977    ErrorCode = Entry.SetTo (&EntryRef, true /* traverse symbolic links */);
5978    if (ErrorCode != B_OK ||
5979    ((ErrorCode = /* assignment */ B_ENTRY_NOT_FOUND) != 0 /* this pacifies
5980    mwcc -nwhitehorn */ && !Entry.Exists ()) ||
5981    ((ErrorCode = Entry.GetPath (&Path)) != B_OK))
5982    {
5983      DisplayErrorMessage ("Bad entry reference encountered, will skip it",
5984        ErrorCode, ErrorTitle);
5985      BulkMode = false;
5986      continue; /* Bad file reference, try the next one. */
5987    }
5988
5989    /* If it's a file, check if it is a spam database file.  Go by the magic
5990    text at the start of the file, in case someone has edited the file with a
5991    spreadsheet or other tool and lost the MIME type. */
5992
5993    if (Entry.IsFile ())
5994    {
5995      ErrorCode = TempFile.SetTo (&Entry, B_READ_ONLY);
5996      if (ErrorCode != B_OK)
5997      {
5998        sprintf (TempString, "Unable to open file \"%s\" for reading, will "
5999          "skip it", Path.Path ());
6000        DisplayErrorMessage (TempString, ErrorCode, ErrorTitle);
6001        BulkMode = false;
6002        continue;
6003      }
6004      if (TempFile.Read (TempString, strlen (g_DatabaseRecognitionString)) ==
6005      (int) strlen (g_DatabaseRecognitionString) && strncmp (TempString,
6006      g_DatabaseRecognitionString, strlen (g_DatabaseRecognitionString)) == 0)
6007      {
6008        ScriptingMessage.what = B_SET_PROPERTY;
6009        ScriptingMessage.AddSpecifier (g_PropertyNames[PN_DATABASE_FILE]);
6010        ScriptingMessage.AddString (g_DataName, Path.Path ());
6011      }
6012      TempFile.Unset ();
6013    }
6014
6015    /* Not a database file.  Could be a directory or a file.  Submit it as
6016    something to be marked spam or genuine. */
6017
6018    if (ScriptingMessage.what == 0)
6019    {
6020      if (!Entry.IsFile ())
6021      {
6022        sprintf (TempString, "\"%s\" is not a file, can't do anything with it",
6023          Path.Path ());
6024        DisplayErrorMessage (TempString, -1, ErrorTitle);
6025        BulkMode = false;
6026        continue;
6027      }
6028
6029      if (!BulkMode) /* Have to ask the user. */
6030      {
6031        ChoiceWindowPntr = new ClassificationChoicesWindow (
6032          BRect (40, 40, 40 + 50 * g_MarginBetweenControls,
6033          40 + g_ButtonHeight * 5), Path.Path (), NumberOfRefs - RefIndex);
6034        ChoiceWindowPntr->Go (&BulkMode, &BulkClassification);
6035        if (BulkClassification == CL_MAX)
6036          break; /* Cancel was picked. */
6037      }
6038
6039      /* Format the command for classifying the file. */
6040
6041      ScriptingMessage.what = B_SET_PROPERTY;
6042
6043      if (BulkClassification == CL_GENUINE)
6044        ScriptingMessage.AddSpecifier (g_PropertyNames[PN_GENUINE]);
6045      else if (BulkClassification == CL_SPAM)
6046        ScriptingMessage.AddSpecifier (g_PropertyNames[PN_SPAM]);
6047      else if (BulkClassification == CL_UNCERTAIN)
6048        ScriptingMessage.AddSpecifier (g_PropertyNames[PN_UNCERTAIN]);
6049      else /* Broken code */
6050        break;
6051      ScriptingMessage.AddString (g_DataName, Path.Path ());
6052    }
6053
6054    /* Tell the BApplication to do the work, and wait for it to finish.  The
6055    BApplication will display any error messages for us. */
6056
6057    ErrorCode =
6058      be_app_messenger.SendMessage (&ScriptingMessage, &ReplyMessage);
6059    if (ErrorCode != B_OK)
6060    {
6061      DisplayErrorMessage ("Unable to send scripting command",
6062        ErrorCode, ErrorTitle);
6063      return;
6064    }
6065
6066    /* If there was an error, allow the user to stop by switching off bulk
6067    mode.  The message will already have been displayed in an alert box, if
6068    server mode is off. */
6069
6070    if (ReplyMessage.FindInt32 ("error", &TempInt32) != B_OK ||
6071    TempInt32 != B_OK)
6072      BulkMode = false;
6073  }
6074}
6075
6076
6077
6078/******************************************************************************
6079 * Implementation of the ControlsView class, constructor, destructor and the
6080 * rest of the member functions in mostly alphabetical order.
6081 */
6082
6083ControlsView::ControlsView (BRect NewBounds)
6084: BView (NewBounds, "ControlsView", B_FOLLOW_TOP | B_FOLLOW_LEFT_RIGHT,
6085    B_WILL_DRAW | B_PULSE_NEEDED | B_NAVIGABLE_JUMP | B_FRAME_EVENTS),
6086  m_AboutButtonPntr (NULL),
6087  m_AddExampleButtonPntr (NULL),
6088  m_BrowseButtonPntr (NULL),
6089  m_BrowseFilePanelPntr (NULL),
6090  m_CreateDatabaseButtonPntr (NULL),
6091  m_DatabaseFileNameTextboxPntr (NULL),
6092  m_DatabaseLoadDone (false),
6093  m_EstimateSpamButtonPntr (NULL),
6094  m_EstimateSpamFilePanelPntr (NULL),
6095  m_GenuineCountTextboxPntr (NULL),
6096  m_IgnorePreviousClassCheckboxPntr (NULL),
6097  m_InstallThingsButtonPntr (NULL),
6098  m_PurgeAgeTextboxPntr (NULL),
6099  m_PurgeButtonPntr (NULL),
6100  m_PurgePopularityTextboxPntr (NULL),
6101  m_ResetToDefaultsButtonPntr (NULL),
6102  m_ScoringModeMenuBarPntr (NULL),
6103  m_ScoringModePopUpMenuPntr (NULL),
6104  m_ServerModeCheckboxPntr (NULL),
6105  m_SpamCountTextboxPntr (NULL),
6106  m_TimeOfLastPoll (0),
6107  m_TokenizeModeMenuBarPntr (NULL),
6108  m_TokenizeModePopUpMenuPntr (NULL),
6109  m_WordCountTextboxPntr (NULL)
6110{
6111}
6112
6113
6114ControlsView::~ControlsView ()
6115{
6116  if (m_BrowseFilePanelPntr != NULL)
6117  {
6118    delete m_BrowseFilePanelPntr;
6119    m_BrowseFilePanelPntr = NULL;
6120  }
6121
6122  if (m_EstimateSpamFilePanelPntr != NULL)
6123  {
6124    delete m_EstimateSpamFilePanelPntr;
6125    m_EstimateSpamFilePanelPntr = NULL;
6126  }
6127}
6128
6129
6130void
6131ControlsView::AttachedToWindow ()
6132{
6133  float         BigPurgeButtonTop;
6134  BMessage      CommandMessage;
6135  const char   *EightDigitsString = " 12345678 ";
6136  float         Height;
6137  float         Margin;
6138  float         RowHeight;
6139  float         RowTop;
6140  ScoringModes  ScoringMode;
6141  const char   *StringPntr;
6142  BMenuItem    *TempMenuItemPntr;
6143  BRect         TempRect;
6144  char          TempString [PATH_MAX];
6145  TokenizeModes TokenizeMode;
6146  float         Width;
6147  float         X;
6148
6149  SetViewColor (ui_color (B_PANEL_BACKGROUND_COLOR));
6150
6151  TempRect = Bounds ();
6152  X = TempRect.right;
6153  RowTop = TempRect.top;
6154  RowHeight = g_ButtonHeight;
6155  if (g_TextBoxHeight > RowHeight)
6156    RowHeight = g_TextBoxHeight;
6157  RowHeight = ceilf (RowHeight * 1.1);
6158
6159  /* Make the Create button at the far right of the first row of controls,
6160  which are all database file related. */
6161
6162  Margin = ceilf ((RowHeight - g_ButtonHeight) / 2);
6163  TempRect = Bounds ();
6164  TempRect.top = RowTop + Margin;
6165  TempRect.bottom = TempRect.top + g_ButtonHeight;
6166
6167  CommandMessage.MakeEmpty ();
6168  CommandMessage.what = B_CREATE_PROPERTY;
6169  CommandMessage.AddSpecifier (g_PropertyNames[PN_DATABASE_FILE]);
6170  m_CreateDatabaseButtonPntr = new BButton (TempRect, "Create Button",
6171    "Create", new BMessage (CommandMessage), B_FOLLOW_RIGHT | B_FOLLOW_TOP);
6172  if (m_CreateDatabaseButtonPntr == NULL) goto ErrorExit;
6173  AddChild (m_CreateDatabaseButtonPntr);
6174  m_CreateDatabaseButtonPntr->SetTarget (be_app);
6175  m_CreateDatabaseButtonPntr->ResizeToPreferred ();
6176  m_CreateDatabaseButtonPntr->GetPreferredSize (&Width, &Height);
6177  m_CreateDatabaseButtonPntr->MoveTo (X - Width, TempRect.top);
6178  X -= Width + g_MarginBetweenControls;
6179
6180  /* Make the Browse button, middle of the first row. */
6181
6182  Margin = ceilf ((RowHeight - g_ButtonHeight) / 2);
6183  TempRect = Bounds ();
6184  TempRect.top = RowTop + Margin;
6185  TempRect.bottom = TempRect.top + g_ButtonHeight;
6186
6187  m_BrowseButtonPntr = new BButton (TempRect, "Browse Button",
6188    "Browse���", new BMessage (MSG_BROWSE_BUTTON), B_FOLLOW_RIGHT | B_FOLLOW_TOP);
6189  if (m_BrowseButtonPntr == NULL) goto ErrorExit;
6190  AddChild (m_BrowseButtonPntr);
6191  m_BrowseButtonPntr->SetTarget (this);
6192  m_BrowseButtonPntr->ResizeToPreferred ();
6193  m_BrowseButtonPntr->GetPreferredSize (&Width, &Height);
6194  m_BrowseButtonPntr->MoveTo (X - Width, TempRect.top);
6195  X -= Width + g_MarginBetweenControls;
6196
6197  /* Fill the rest of the space on the first row with the file name box. */
6198
6199  Margin = ceilf ((RowHeight - g_TextBoxHeight) / 2);
6200  TempRect = Bounds ();
6201  TempRect.top = RowTop + Margin;
6202  TempRect.bottom = TempRect.top + g_TextBoxHeight;
6203  TempRect.right = X;
6204
6205  StringPntr = "Word Database:";
6206  strcpy (m_DatabaseFileNameCachedValue, "Unknown...");
6207  m_DatabaseFileNameTextboxPntr = new BTextControl (TempRect,
6208    "File Name",
6209    StringPntr /* label */,
6210    m_DatabaseFileNameCachedValue /* text */,
6211    new BMessage (MSG_DATABASE_NAME),
6212    B_FOLLOW_LEFT_RIGHT | B_FOLLOW_TOP,
6213    B_WILL_DRAW | B_NAVIGABLE | B_NAVIGABLE_JUMP);
6214  AddChild (m_DatabaseFileNameTextboxPntr);
6215  m_DatabaseFileNameTextboxPntr->SetTarget (this);
6216  m_DatabaseFileNameTextboxPntr->SetDivider (
6217    be_plain_font->StringWidth (StringPntr) + g_MarginBetweenControls);
6218
6219  /* Second row contains the purge age, and a long line explaining it.  There
6220  is space to the right where the top half of the big purge button will go. */
6221
6222  RowTop += RowHeight /* previous row's RowHeight */;
6223  BigPurgeButtonTop = RowTop;
6224  TempRect = Bounds ();
6225  X = TempRect.left;
6226  RowHeight = g_TextBoxHeight;
6227  RowHeight = ceilf (RowHeight * 1.1);
6228
6229  StringPntr = "Number of occurrences needed to store a word:";
6230  m_PurgeAgeCachedValue = 12345678;
6231
6232  Margin = ceilf ((RowHeight - g_TextBoxHeight) / 2);
6233  TempRect.top = RowTop + Margin;
6234  TempRect.bottom = TempRect.top + g_TextBoxHeight;
6235  TempRect.left = X;
6236  TempRect.right = TempRect.left +
6237    be_plain_font->StringWidth (StringPntr) +
6238    be_plain_font->StringWidth (EightDigitsString) +
6239    3 * g_MarginBetweenControls;
6240
6241  sprintf (TempString, "%d", (int) m_PurgeAgeCachedValue);
6242  m_PurgeAgeTextboxPntr = new BTextControl (TempRect,
6243    "Purge Age",
6244    StringPntr /* label */,
6245    TempString /* text */,
6246    new BMessage (MSG_PURGE_AGE),
6247    B_FOLLOW_LEFT | B_FOLLOW_TOP,
6248    B_WILL_DRAW | B_NAVIGABLE);
6249  AddChild (m_PurgeAgeTextboxPntr);
6250  m_PurgeAgeTextboxPntr->SetTarget (this);
6251  m_PurgeAgeTextboxPntr->SetDivider (
6252    be_plain_font->StringWidth (StringPntr) + g_MarginBetweenControls);
6253
6254  /* Third row contains the purge popularity and bottom half of the purge
6255  button. */
6256
6257  RowTop += RowHeight /* previous row's RowHeight */;
6258  TempRect = Bounds ();
6259  X = TempRect.left;
6260  RowHeight = g_TextBoxHeight;
6261  RowHeight = ceilf (RowHeight * 1.1);
6262
6263  StringPntr = "Number of messages to store words from:";
6264  m_PurgePopularityCachedValue = 87654321;
6265  Margin = ceilf ((RowHeight - g_TextBoxHeight) / 2);
6266  TempRect.top = RowTop + Margin;
6267  TempRect.bottom = TempRect.top + g_TextBoxHeight;
6268  TempRect.left = X;
6269  TempRect.right = TempRect.left +
6270    be_plain_font->StringWidth (StringPntr) +
6271    be_plain_font->StringWidth (EightDigitsString) +
6272    3 * g_MarginBetweenControls;
6273  X = TempRect.right + g_MarginBetweenControls;
6274
6275  sprintf (TempString, "%d", (int) m_PurgePopularityCachedValue);
6276  m_PurgePopularityTextboxPntr = new BTextControl (TempRect,
6277    "Purge Popularity",
6278    StringPntr /* label */,
6279    TempString /* text */,
6280    new BMessage (MSG_PURGE_POPULARITY),
6281    B_FOLLOW_LEFT | B_FOLLOW_TOP,
6282    B_WILL_DRAW | B_NAVIGABLE);
6283  AddChild (m_PurgePopularityTextboxPntr);
6284  m_PurgePopularityTextboxPntr->SetTarget (this);
6285  m_PurgePopularityTextboxPntr->SetDivider (
6286    be_plain_font->StringWidth (StringPntr) + g_MarginBetweenControls);
6287
6288  /* Make the purge button, which will take up space in the 2nd and 3rd rows,
6289  on the right side.  Twice as tall as a regular button too. */
6290
6291  StringPntr = "Remove Old Words";
6292  Margin = ceilf ((((RowTop + RowHeight) - BigPurgeButtonTop) -
6293    2 * g_TextBoxHeight) / 2);
6294  TempRect.top = BigPurgeButtonTop + Margin;
6295  TempRect.bottom = TempRect.top + 2 * g_TextBoxHeight;
6296  TempRect.left = X;
6297  TempRect.right = X + ceilf (2 * be_plain_font->StringWidth (StringPntr));
6298
6299  CommandMessage.MakeEmpty ();
6300  CommandMessage.what = B_EXECUTE_PROPERTY;
6301  CommandMessage.AddSpecifier (g_PropertyNames[PN_PURGE]);
6302  m_PurgeButtonPntr = new BButton (TempRect, "Purge Button",
6303    StringPntr, new BMessage (CommandMessage), B_FOLLOW_LEFT | B_FOLLOW_TOP);
6304  if (m_PurgeButtonPntr == NULL) goto ErrorExit;
6305  m_PurgeButtonPntr->ResizeToPreferred();
6306  AddChild (m_PurgeButtonPntr);
6307  m_PurgeButtonPntr->SetTarget (be_app);
6308
6309  /* The fourth row contains the ignore previous classification checkbox. */
6310
6311  RowTop += RowHeight /* previous row's RowHeight */;
6312  TempRect = Bounds ();
6313  X = TempRect.left;
6314  RowHeight = g_CheckBoxHeight;
6315  RowHeight = ceilf (RowHeight * 1.1);
6316
6317  StringPntr = "Allow Retraining on a Message";
6318  m_IgnorePreviousClassCachedValue = false;
6319
6320  Margin = ceilf ((RowHeight - g_CheckBoxHeight) / 2);
6321  TempRect.top = RowTop + Margin;
6322  TempRect.bottom = TempRect.top + g_CheckBoxHeight;
6323  TempRect.left = X;
6324  m_IgnorePreviousClassCheckboxPntr = new BCheckBox (TempRect,
6325    "Ignore Check",
6326    StringPntr,
6327    new BMessage (MSG_IGNORE_CLASSIFICATION),
6328    B_FOLLOW_TOP | B_FOLLOW_LEFT);
6329  if (m_IgnorePreviousClassCheckboxPntr == NULL) goto ErrorExit;
6330  AddChild (m_IgnorePreviousClassCheckboxPntr);
6331  m_IgnorePreviousClassCheckboxPntr->SetTarget (this);
6332  m_IgnorePreviousClassCheckboxPntr->ResizeToPreferred ();
6333  m_IgnorePreviousClassCheckboxPntr->GetPreferredSize (&Width, &Height);
6334  X += Width + g_MarginBetweenControls;
6335
6336  /* The fifth row contains the server mode checkbox. */
6337
6338  RowTop += RowHeight /* previous row's RowHeight */;
6339  TempRect = Bounds ();
6340  RowHeight = g_CheckBoxHeight;
6341  RowHeight = ceilf (RowHeight * 1.1);
6342
6343  StringPntr = "Print errors to Terminal";
6344  m_ServerModeCachedValue = false;
6345
6346  Margin = ceilf ((RowHeight - g_CheckBoxHeight) / 2);
6347  TempRect.top = RowTop + Margin;
6348  TempRect.bottom = TempRect.top + g_CheckBoxHeight;
6349  m_ServerModeCheckboxPntr = new BCheckBox (TempRect,
6350    "ServerMode Check",
6351    StringPntr,
6352    new BMessage (MSG_SERVER_MODE),
6353    B_FOLLOW_TOP | B_FOLLOW_LEFT);
6354  if (m_ServerModeCheckboxPntr == NULL) goto ErrorExit;
6355  AddChild (m_ServerModeCheckboxPntr);
6356  m_ServerModeCheckboxPntr->SetTarget (this);
6357  m_ServerModeCheckboxPntr->ResizeToPreferred ();
6358  m_ServerModeCheckboxPntr->GetPreferredSize (&Width, &Height);
6359
6360  /* This row just contains a huge pop-up menu which shows the tokenize mode
6361  and an explanation of what each mode does. */
6362
6363  RowTop += RowHeight /* previous row's RowHeight */;
6364  TempRect = Bounds ();
6365  RowHeight = g_PopUpMenuHeight;
6366  RowHeight = ceilf (RowHeight * 1.1);
6367
6368  Margin = ceilf ((RowHeight - g_PopUpMenuHeight) / 2);
6369  TempRect.top = RowTop + Margin;
6370  TempRect.bottom = TempRect.top + g_PopUpMenuHeight;
6371
6372  m_TokenizeModeCachedValue = TM_MAX; /* Illegal value will force redraw. */
6373  m_TokenizeModeMenuBarPntr = new BMenuBar (TempRect, "TokenizeModeMenuBar",
6374    B_FOLLOW_LEFT_RIGHT | B_FOLLOW_TOP, B_ITEMS_IN_COLUMN,
6375    false /* resize to fit items */);
6376  if (m_TokenizeModeMenuBarPntr == NULL) goto ErrorExit;
6377  m_TokenizeModePopUpMenuPntr = new BPopUpMenu ("TokenizeModePopUpMenu");
6378  if (m_TokenizeModePopUpMenuPntr == NULL) goto ErrorExit;
6379
6380  for (TokenizeMode = (TokenizeModes) 0;
6381  TokenizeMode < TM_MAX;
6382  TokenizeMode = (TokenizeModes) ((int) TokenizeMode + 1))
6383  {
6384    /* Each different tokenize mode gets its own menu item.  Selecting the item
6385    will send a canned command to the application to switch to the appropriate
6386    tokenize mode.  An optional explanation of each mode is added to the mode
6387    name string. */
6388
6389    CommandMessage.MakeEmpty ();
6390    CommandMessage.what = B_SET_PROPERTY;
6391    CommandMessage.AddSpecifier (g_PropertyNames[PN_TOKENIZE_MODE]);
6392    CommandMessage.AddString (g_DataName, g_TokenizeModeNames[TokenizeMode]);
6393    strcpy (TempString, g_TokenizeModeNames[TokenizeMode]);
6394    switch (TokenizeMode)
6395    {
6396      case TM_WHOLE:
6397        strcat (TempString, " - Scan everything");
6398        break;
6399
6400      case TM_PLAIN_TEXT:
6401        strcat (TempString, " - Scan e-mail body text except rich text");
6402        break;
6403
6404      case TM_PLAIN_TEXT_HEADER:
6405        strcat (TempString, " - Scan entire e-mail text except rich text");
6406        break;
6407
6408      case TM_ANY_TEXT:
6409        strcat (TempString, " - Scan e-mail body text and text attachments");
6410        break;
6411
6412      case TM_ANY_TEXT_HEADER:
6413       strcat (TempString, " - Scan entire e-mail text and text attachments (recommended)");
6414        break;
6415
6416      case TM_ALL_PARTS:
6417        strcat (TempString, " - Scan e-mail body and all attachments");
6418        break;
6419
6420      case TM_ALL_PARTS_HEADER:
6421        strcat (TempString, " - Scan all parts of the e-mail");
6422        break;
6423
6424      case TM_JUST_HEADER:
6425        strcat (TempString, " - Scan just the header (mail routing information)");
6426        break;
6427
6428      default:
6429        break;
6430    }
6431    TempMenuItemPntr =
6432      new BMenuItem (TempString, new BMessage (CommandMessage));
6433    if (TempMenuItemPntr == NULL) goto ErrorExit;
6434    TempMenuItemPntr->SetTarget (be_app);
6435    m_TokenizeModePopUpMenuPntr->AddItem (TempMenuItemPntr);
6436  }
6437  m_TokenizeModeMenuBarPntr->AddItem (m_TokenizeModePopUpMenuPntr);
6438  AddChild (m_TokenizeModeMenuBarPntr);
6439
6440  /* This row just contains a huge pop-up menu which shows the scoring mode
6441  and an explanation of what each mode does. */
6442
6443  RowTop += RowHeight /* previous row's RowHeight */;
6444  TempRect = Bounds ();
6445  RowHeight = g_PopUpMenuHeight;
6446  RowHeight = ceilf (RowHeight * 1.1);
6447
6448  Margin = ceilf ((RowHeight - g_PopUpMenuHeight) / 2);
6449  TempRect.top = RowTop + Margin;
6450  TempRect.bottom = TempRect.top + g_PopUpMenuHeight;
6451
6452  m_ScoringModeCachedValue = SM_MAX; /* Illegal value will force redraw. */
6453  m_ScoringModeMenuBarPntr = new BMenuBar (TempRect, "ScoringModeMenuBar",
6454    B_FOLLOW_LEFT_RIGHT | B_FOLLOW_TOP, B_ITEMS_IN_COLUMN,
6455    false /* resize to fit items */);
6456  if (m_ScoringModeMenuBarPntr == NULL) goto ErrorExit;
6457  m_ScoringModePopUpMenuPntr = new BPopUpMenu ("ScoringModePopUpMenu");
6458  if (m_ScoringModePopUpMenuPntr == NULL) goto ErrorExit;
6459
6460  for (ScoringMode = (ScoringModes) 0;
6461  ScoringMode < SM_MAX;
6462  ScoringMode = (ScoringModes) ((int) ScoringMode + 1))
6463  {
6464    /* Each different scoring mode gets its own menu item.  Selecting the item
6465    will send a canned command to the application to switch to the appropriate
6466    scoring mode.  An optional explanation of each mode is added to the mode
6467    name string. */
6468
6469    CommandMessage.MakeEmpty ();
6470    CommandMessage.what = B_SET_PROPERTY;
6471    CommandMessage.AddSpecifier (g_PropertyNames[PN_SCORING_MODE]);
6472    CommandMessage.AddString (g_DataName, g_ScoringModeNames[ScoringMode]);
6473/*
6474    strcpy (TempString, g_ScoringModeNames[ScoringMode]);
6475    switch (ScoringMode)
6476    {
6477      case SM_ROBINSON:
6478        strcat (TempString, " - Learning Method 1: Naive Bayesian");
6479        break;
6480
6481      case SM_CHISQUARED:
6482        strcat (TempString, " - Learning Method 2: Chi-Squared");
6483        break;
6484
6485      default:
6486        break;
6487    }
6488*/
6489    switch (ScoringMode)
6490    {
6491      case SM_ROBINSON:
6492        strcpy (TempString, "Learning method 1: Naive Bayesian");
6493        break;
6494
6495      case SM_CHISQUARED:
6496        strcpy (TempString, "Learning method 2: Chi-Squared");
6497        break;
6498
6499      default:
6500        break;
6501    }
6502    TempMenuItemPntr =
6503      new BMenuItem (TempString, new BMessage (CommandMessage));
6504    if (TempMenuItemPntr == NULL) goto ErrorExit;
6505    TempMenuItemPntr->SetTarget (be_app);
6506    m_ScoringModePopUpMenuPntr->AddItem (TempMenuItemPntr);
6507  }
6508  m_ScoringModeMenuBarPntr->AddItem (m_ScoringModePopUpMenuPntr);
6509  AddChild (m_ScoringModeMenuBarPntr);
6510
6511  /* The next row has the install MIME types button and the reset to defaults
6512  button, one on the left and the other on the right. */
6513
6514  RowTop += RowHeight /* previous row's RowHeight */;
6515  TempRect = Bounds ();
6516  RowHeight = g_ButtonHeight;
6517  RowHeight = ceilf (RowHeight * 1.1);
6518
6519  Margin = ceilf ((RowHeight - g_ButtonHeight) / 2);
6520  TempRect.top = RowTop + Margin;
6521  TempRect.bottom = TempRect.top + g_ButtonHeight;
6522
6523  CommandMessage.MakeEmpty ();
6524  CommandMessage.what = B_EXECUTE_PROPERTY;
6525  CommandMessage.AddSpecifier (g_PropertyNames[PN_INSTALL_THINGS]);
6526  m_InstallThingsButtonPntr = new BButton (TempRect, "Install Button",
6527    "Install spam types",
6528    new BMessage (CommandMessage),
6529    B_FOLLOW_LEFT | B_FOLLOW_TOP);
6530  if (m_InstallThingsButtonPntr == NULL) goto ErrorExit;
6531  AddChild (m_InstallThingsButtonPntr);
6532  m_InstallThingsButtonPntr->SetTarget (be_app);
6533  m_InstallThingsButtonPntr->ResizeToPreferred ();
6534
6535  /* The Reset to Defaults button.  On the right side of the row. */
6536
6537  Margin = ceilf ((RowHeight - g_ButtonHeight) / 2);
6538  TempRect = Bounds ();
6539  TempRect.top = RowTop + Margin;
6540  TempRect.bottom = TempRect.top + g_ButtonHeight;
6541
6542  CommandMessage.MakeEmpty ();
6543  CommandMessage.what = B_EXECUTE_PROPERTY;
6544  CommandMessage.AddSpecifier (g_PropertyNames[PN_RESET_TO_DEFAULTS]);
6545  m_ResetToDefaultsButtonPntr = new BButton (TempRect, "Reset Button",
6546    "Default settings", new BMessage (CommandMessage),
6547    B_FOLLOW_RIGHT | B_FOLLOW_TOP);
6548  if (m_ResetToDefaultsButtonPntr == NULL) goto ErrorExit;
6549  AddChild (m_ResetToDefaultsButtonPntr);
6550  m_ResetToDefaultsButtonPntr->SetTarget (be_app);
6551  m_ResetToDefaultsButtonPntr->ResizeToPreferred ();
6552  m_ResetToDefaultsButtonPntr->GetPreferredSize (&Width, &Height);
6553  m_ResetToDefaultsButtonPntr->MoveTo (TempRect.right - Width, TempRect.top);
6554
6555  /* The next row contains the Estimate, Add Examples and About buttons. */
6556
6557  RowTop += RowHeight /* previous row's RowHeight */;
6558  TempRect = Bounds ();
6559  X = TempRect.left;
6560  RowHeight = g_ButtonHeight;
6561  RowHeight = ceilf (RowHeight * 1.1);
6562
6563  Margin = ceilf ((RowHeight - g_ButtonHeight) / 2);
6564  TempRect.top = RowTop + Margin;
6565  TempRect.bottom = TempRect.top + g_ButtonHeight;
6566  TempRect.left = X;
6567
6568  m_EstimateSpamButtonPntr = new BButton (TempRect, "Estimate Button",
6569    "Scan a message",
6570    new BMessage (MSG_ESTIMATE_BUTTON),
6571    B_FOLLOW_LEFT | B_FOLLOW_TOP);
6572  if (m_EstimateSpamButtonPntr == NULL) goto ErrorExit;
6573  AddChild (m_EstimateSpamButtonPntr);
6574  m_EstimateSpamButtonPntr->SetTarget (this);
6575  m_EstimateSpamButtonPntr->ResizeToPreferred ();
6576  X = m_EstimateSpamButtonPntr->Frame().right + g_MarginBetweenControls;
6577
6578  /* The Add Example button in the middle.  Does the same as the browse button,
6579  but don't tell anyone that! */
6580
6581  Margin = ceilf ((RowHeight - g_ButtonHeight) / 2);
6582  TempRect.top = RowTop + Margin;
6583  TempRect.bottom = TempRect.top + g_ButtonHeight;
6584  TempRect.left = X;
6585
6586  m_AddExampleButtonPntr = new BButton (TempRect, "Example Button",
6587    "Train spam filter on a message",
6588    new BMessage (MSG_BROWSE_BUTTON),
6589    B_FOLLOW_LEFT_RIGHT | B_FOLLOW_TOP,
6590    B_WILL_DRAW | B_NAVIGABLE | B_FULL_UPDATE_ON_RESIZE);
6591  if (m_AddExampleButtonPntr == NULL) goto ErrorExit;
6592  AddChild (m_AddExampleButtonPntr);
6593  m_AddExampleButtonPntr->SetTarget (this);
6594  m_AddExampleButtonPntr->ResizeToPreferred ();
6595  X = m_AddExampleButtonPntr->Frame().right + g_MarginBetweenControls;
6596
6597  /* Add the About button on the right. */
6598
6599  Margin = ceilf ((RowHeight - g_ButtonHeight) / 2);
6600  TempRect = Bounds ();
6601  TempRect.top = RowTop + Margin;
6602  TempRect.bottom = TempRect.top + g_ButtonHeight;
6603  TempRect.left = X;
6604
6605  m_AboutButtonPntr = new BButton (TempRect, "About Button",
6606    "About���",
6607    new BMessage (B_ABOUT_REQUESTED),
6608    B_FOLLOW_RIGHT | B_FOLLOW_TOP);
6609  if (m_AboutButtonPntr == NULL) goto ErrorExit;
6610  AddChild (m_AboutButtonPntr);
6611  m_AboutButtonPntr->SetTarget (be_app);
6612
6613  /* This row displays various counters.  Starting with the genuine messages
6614  count on the left. */
6615
6616  RowTop += RowHeight /* previous row's RowHeight */;
6617  TempRect = Bounds ();
6618  RowHeight = g_TextBoxHeight;
6619  RowHeight = ceilf (RowHeight * 1.1);
6620
6621  StringPntr = "Genuine messages:";
6622  m_GenuineCountCachedValue = 87654321;
6623  sprintf (TempString, "%d", (int) m_GenuineCountCachedValue);
6624
6625  Margin = ceilf ((RowHeight - g_TextBoxHeight) / 2);
6626  TempRect = Bounds ();
6627  TempRect.top = RowTop + Margin;
6628  TempRect.bottom = TempRect.top + g_TextBoxHeight;
6629  TempRect.right = TempRect.left +
6630    be_plain_font->StringWidth (StringPntr) +
6631    be_plain_font->StringWidth (TempString) +
6632    3 * g_MarginBetweenControls;
6633
6634  m_GenuineCountTextboxPntr = new BTextControl (TempRect,
6635    "Genuine count",
6636    StringPntr /* label */,
6637    TempString /* text */,
6638    NULL /* no message */,
6639    B_FOLLOW_LEFT | B_FOLLOW_TOP,
6640    B_WILL_DRAW /* not B_NAVIGABLE */);
6641  AddChild (m_GenuineCountTextboxPntr);
6642  m_GenuineCountTextboxPntr->SetTarget (this); /* Not that it matters. */
6643  m_GenuineCountTextboxPntr->SetDivider (
6644    be_plain_font->StringWidth (StringPntr) + g_MarginBetweenControls);
6645  m_GenuineCountTextboxPntr->SetEnabled (false); /* For display only. */
6646
6647  /* The word count in the center. */
6648
6649  StringPntr = "Word count:";
6650  m_WordCountCachedValue = 87654321;
6651  sprintf (TempString, "%d", (int) m_WordCountCachedValue);
6652
6653  Margin = ceilf ((RowHeight - g_TextBoxHeight) / 2);
6654  TempRect = Bounds ();
6655  TempRect.top = RowTop + Margin;
6656  TempRect.bottom = TempRect.top + g_TextBoxHeight;
6657  Width = be_plain_font->StringWidth (StringPntr) +
6658    be_plain_font->StringWidth (TempString) +
6659    3 * g_MarginBetweenControls;
6660  TempRect.left = ceilf ((TempRect.right - TempRect.left) / 2 - Width / 2);
6661  TempRect.right = TempRect.left + Width;
6662
6663  m_WordCountTextboxPntr = new BTextControl (TempRect,
6664    "Word count",
6665    StringPntr /* label */,
6666    TempString /* text */,
6667    NULL /* no message */,
6668    B_FOLLOW_H_CENTER | B_FOLLOW_TOP,
6669    B_WILL_DRAW /* not B_NAVIGABLE */);
6670  AddChild (m_WordCountTextboxPntr);
6671  m_WordCountTextboxPntr->SetTarget (this); /* Not that it matters. */
6672  m_WordCountTextboxPntr->SetDivider (
6673    be_plain_font->StringWidth (StringPntr) + g_MarginBetweenControls);
6674  m_WordCountTextboxPntr->SetEnabled (false); /* For display only. */
6675
6676  /* The spam count on the far right. */
6677
6678  StringPntr = "Spam messages:";
6679  m_SpamCountCachedValue = 87654321;
6680  sprintf (TempString, "%d", (int) m_SpamCountCachedValue);
6681
6682  Margin = ceilf ((RowHeight - g_TextBoxHeight) / 2);
6683  TempRect = Bounds ();
6684  TempRect.top = RowTop + Margin;
6685  TempRect.bottom = TempRect.top + g_TextBoxHeight;
6686  TempRect.left = TempRect.right -
6687    be_plain_font->StringWidth (StringPntr) -
6688    be_plain_font->StringWidth (TempString) -
6689    3 * g_MarginBetweenControls;
6690
6691  m_SpamCountTextboxPntr = new BTextControl (TempRect,
6692    "Spam count",
6693    StringPntr /* label */,
6694    TempString /* text */,
6695    NULL /* no message */,
6696    B_FOLLOW_RIGHT | B_FOLLOW_TOP,
6697    B_WILL_DRAW /* not B_NAVIGABLE */);
6698  AddChild (m_SpamCountTextboxPntr);
6699  m_SpamCountTextboxPntr->SetTarget (this); /* Not that it matters. */
6700  m_SpamCountTextboxPntr->SetDivider (
6701    be_plain_font->StringWidth (StringPntr) + g_MarginBetweenControls);
6702  m_SpamCountTextboxPntr->SetEnabled (false); /* For display only. */
6703
6704  /* Change the size of our view so it only takes up the space needed by the
6705  buttons. */
6706
6707  RowTop += RowHeight /* previous row's RowHeight */;
6708  ResizeTo (Bounds().Width(), RowTop - Bounds().top + 1);
6709
6710  return; /* Successful. */
6711
6712ErrorExit:
6713  DisplayErrorMessage ("Unable to initialise the controls view.");
6714}
6715
6716
6717void
6718ControlsView::BrowseForDatabaseFile ()
6719{
6720  if (m_BrowseFilePanelPntr == NULL)
6721  {
6722    BEntry      DirectoryEntry;
6723    entry_ref   DirectoryEntryRef;
6724    BMessage    GetDatabasePathCommand;
6725    BMessage    GetDatabasePathResult;
6726    const char *StringPntr = NULL;
6727
6728    /* Create a new file panel.  First set up the entry ref stuff so that the
6729    file panel can open to show the initial directory (the one where the
6730    database file currently is).  Note that we have to create it after the
6731    window and view are up and running, otherwise the BMessenger won't point to
6732    a valid looper/handler.  First find out the current database file name to
6733    use as a starting point. */
6734
6735    GetDatabasePathCommand.what = B_GET_PROPERTY;
6736    GetDatabasePathCommand.AddSpecifier (g_PropertyNames[PN_DATABASE_FILE]);
6737    be_app_messenger.SendMessage (&GetDatabasePathCommand,
6738      &GetDatabasePathResult, 5000000 /* delivery timeout */,
6739      5000000 /* reply timeout */);
6740    if (GetDatabasePathResult.FindString (g_ResultName, &StringPntr) != B_OK ||
6741    DirectoryEntry.SetTo (StringPntr) != B_OK ||
6742    DirectoryEntry.GetParent (&DirectoryEntry) != B_OK)
6743      DirectoryEntry.SetTo ("."); /* Default directory if we can't find it. */
6744    if (DirectoryEntry.GetRef (&DirectoryEntryRef) != B_OK)
6745    {
6746      DisplayErrorMessage (
6747        "Unable to set up the file requestor starting directory.  Sorry.");
6748      return;
6749    }
6750
6751    m_BrowseFilePanelPntr = new BFilePanel (
6752      B_OPEN_PANEL /* mode */,
6753      &be_app_messenger /* target for event messages */,
6754      &DirectoryEntryRef /* starting directory */,
6755      B_FILE_NODE,
6756      true /* true for multiple selections */,
6757      NULL /* canned message */,
6758      NULL /* ref filter */,
6759      false /* true for modal */,
6760      true /* true to hide when done */);
6761  }
6762
6763  if (m_BrowseFilePanelPntr != NULL)
6764    m_BrowseFilePanelPntr->Show (); /* Answer returned later in RefsReceived. */
6765}
6766
6767
6768void
6769ControlsView::BrowseForFileToEstimate ()
6770{
6771  if (m_EstimateSpamFilePanelPntr == NULL)
6772  {
6773    BEntry      DirectoryEntry;
6774    entry_ref   DirectoryEntryRef;
6775    status_t    ErrorCode;
6776    BMessenger  MessengerToSelf (this);
6777    BPath       PathToMailDirectory;
6778
6779    /* Create a new file panel.  First set up the entry ref stuff so that the
6780    file panel can open to show the initial directory (the user's mail
6781    directory).  Note that we have to create the panel after the window and
6782    view are up and running, otherwise the BMessenger won't point to a valid
6783    looper/handler. */
6784
6785    ErrorCode = find_directory (B_USER_DIRECTORY, &PathToMailDirectory);
6786    if (ErrorCode == B_OK)
6787    {
6788      PathToMailDirectory.Append ("mail");
6789      ErrorCode = DirectoryEntry.SetTo (PathToMailDirectory.Path(),
6790        true /* traverse symbolic links*/);
6791      if (ErrorCode != B_OK || !DirectoryEntry.Exists ())
6792      {
6793        /* If no mail directory, try home directory. */
6794        find_directory (B_USER_DIRECTORY, &PathToMailDirectory);
6795        ErrorCode = DirectoryEntry.SetTo (PathToMailDirectory.Path(), true);
6796      }
6797    }
6798    if (ErrorCode != B_OK)
6799      PathToMailDirectory.SetTo (".");
6800
6801    DirectoryEntry.SetTo (PathToMailDirectory.Path(), true);
6802    if (DirectoryEntry.GetRef (&DirectoryEntryRef) != B_OK)
6803    {
6804      DisplayErrorMessage (
6805        "Unable to set up the file requestor starting directory.  Sorry.");
6806      return;
6807    }
6808
6809    m_EstimateSpamFilePanelPntr = new BFilePanel (
6810      B_OPEN_PANEL /* mode */,
6811      &MessengerToSelf /* target for event messages */,
6812      &DirectoryEntryRef /* starting directory */,
6813      B_FILE_NODE,
6814      true /* true for multiple selections */,
6815      new BMessage (MSG_ESTIMATE_FILE_REFS) /* canned message */,
6816      NULL /* ref filter */,
6817      false /* true for modal */,
6818      true /* true to hide when done */);
6819  }
6820
6821  if (m_EstimateSpamFilePanelPntr != NULL)
6822    m_EstimateSpamFilePanelPntr->Show (); /* Answer sent via a message. */
6823}
6824
6825
6826/* The display has been resized.  Have to manually adjust the popup menu bar to
6827show the new size (the sub-items need to be resized too).  Then make it redraw.
6828Well, actually just resetting the mark on the current item will resize it
6829properly. */
6830
6831void
6832ControlsView::FrameResized (float, float)
6833{
6834  m_ScoringModeCachedValue = SM_MAX; /* Force it to reset the mark. */
6835  m_TokenizeModeCachedValue = TM_MAX; /* Force it to reset the mark. */
6836}
6837
6838
6839void
6840ControlsView::MessageReceived (BMessage *MessagePntr)
6841{
6842  BMessage CommandMessage;
6843  bool     TempBool;
6844  uint32   TempUint32;
6845
6846  switch (MessagePntr->what)
6847  {
6848    case MSG_BROWSE_BUTTON:
6849      BrowseForDatabaseFile ();
6850      break;
6851
6852    case MSG_DATABASE_NAME:
6853      if (strcmp (m_DatabaseFileNameCachedValue,
6854      m_DatabaseFileNameTextboxPntr->Text ()) != 0)
6855        SubmitCommandString (PN_DATABASE_FILE, B_SET_PROPERTY,
6856        m_DatabaseFileNameTextboxPntr->Text ());
6857      break;
6858
6859    case MSG_ESTIMATE_BUTTON:
6860      BrowseForFileToEstimate ();
6861      break;
6862
6863    case MSG_ESTIMATE_FILE_REFS:
6864      EstimateRefFilesAndDisplay (MessagePntr);
6865      break;
6866
6867    case MSG_IGNORE_CLASSIFICATION:
6868      TempBool = (m_IgnorePreviousClassCheckboxPntr->Value() == B_CONTROL_ON);
6869      if (m_IgnorePreviousClassCachedValue != TempBool)
6870        SubmitCommandBool (PN_IGNORE_PREVIOUS_CLASSIFICATION,
6871        B_SET_PROPERTY, TempBool);
6872      break;
6873
6874    case MSG_PURGE_AGE:
6875      TempUint32 = strtoul (m_PurgeAgeTextboxPntr->Text (), NULL, 10);
6876      if (m_PurgeAgeCachedValue != TempUint32)
6877        SubmitCommandInt32 (PN_PURGE_AGE, B_SET_PROPERTY, TempUint32);
6878      break;
6879
6880    case MSG_PURGE_POPULARITY:
6881      TempUint32 = strtoul (m_PurgePopularityTextboxPntr->Text (), NULL, 10);
6882      if (m_PurgePopularityCachedValue != TempUint32)
6883        SubmitCommandInt32 (PN_PURGE_POPULARITY, B_SET_PROPERTY, TempUint32);
6884      break;
6885
6886    case MSG_SERVER_MODE:
6887      TempBool = (m_ServerModeCheckboxPntr->Value() == B_CONTROL_ON);
6888      if (m_ServerModeCachedValue != TempBool)
6889        SubmitCommandBool (PN_SERVER_MODE, B_SET_PROPERTY, TempBool);
6890      break;
6891
6892    default:
6893      BView::MessageReceived (MessagePntr);
6894  }
6895}
6896
6897
6898/* Check the server for changes in the state of the database, and if there are
6899any changes, update the displayed values.  Since this is a read only
6900examination of the server, we go directly to the application rather than
6901sending it messages.  Also, when sending messages, we can't find out what it is
6902doing while it is busy with a batch of spam additions (all the spam add
6903commands will be in the queue ahead of our requests for info).  Instead, we
6904lock the BApplication (so it isn't changing things while we're looking) and
6905retrieve our values. */
6906
6907void
6908ControlsView::PollServerForChanges ()
6909{
6910  ABSApp     *MyAppPntr;
6911  BMenuItem  *TempMenuItemPntr;
6912  char        TempString [PATH_MAX];
6913  BWindow    *WindowPntr;
6914
6915  /* We need a pointer to our window, for changing the title etc. */
6916
6917  WindowPntr = Window ();
6918  if (WindowPntr == NULL)
6919    return; /* No window, no point in updating the display! */
6920
6921  /* Check the server mode flag.  If the mode is off, then the window has to be
6922  minimized.  Similarly, if it gets turned on, maximize the window.  Note that
6923  the user can maximize the window manually, even while still in server mode.
6924  */
6925
6926  if (g_ServerMode != m_ServerModeCachedValue &&
6927  m_ServerModeCheckboxPntr != NULL)
6928  {
6929    m_ServerModeCachedValue = g_ServerMode;
6930    m_ServerModeCheckboxPntr->SetValue (
6931      m_ServerModeCachedValue ? B_CONTROL_ON : B_CONTROL_OFF);
6932    WindowPntr->Minimize (m_ServerModeCachedValue);
6933  }
6934
6935  if (WindowPntr->IsMinimized ())
6936    return; /* Window isn't visible, don't waste time updating it. */
6937
6938  /* So that people don't stare at a blank screen, request a database load if
6939  nothing is there.  But only do it once, so the user doesn't get a lot of
6940  invalid database messages if one doesn't exist yet.  In server mode, we never
6941  get this far so it is only loaded when the user wants to see something. */
6942
6943  if (!m_DatabaseLoadDone)
6944  {
6945    m_DatabaseLoadDone = true;
6946    /* Counting the number of words will load the database. */
6947    SubmitCommandString (PN_DATABASE_FILE, B_COUNT_PROPERTIES, "");
6948  }
6949
6950  /* Check various read only values, which can be read from the BApplication
6951  without having to lock it.  This is useful for displaying the number of words
6952  as it is changing.  First up is the purge age setting. */
6953
6954  MyAppPntr = dynamic_cast<ABSApp *> (be_app);
6955  if (MyAppPntr == NULL)
6956    return; /* Doesn't exist or is the wrong class.  Not likely! */
6957
6958  if (MyAppPntr->m_PurgeAge != m_PurgeAgeCachedValue &&
6959  m_PurgeAgeTextboxPntr != NULL)
6960  {
6961    m_PurgeAgeCachedValue = MyAppPntr->m_PurgeAge;
6962    sprintf (TempString, "%" B_PRIu32, m_PurgeAgeCachedValue);
6963    m_PurgeAgeTextboxPntr->SetText (TempString);
6964  }
6965
6966  /* Check the purge popularity. */
6967
6968  if (MyAppPntr->m_PurgePopularity != m_PurgePopularityCachedValue &&
6969  m_PurgePopularityTextboxPntr != NULL)
6970  {
6971    m_PurgePopularityCachedValue = MyAppPntr->m_PurgePopularity;
6972    sprintf (TempString, "%" B_PRIu32, m_PurgePopularityCachedValue);
6973    m_PurgePopularityTextboxPntr->SetText (TempString);
6974  }
6975
6976  /* Check the Ignore Previous Classification flag. */
6977
6978  if (MyAppPntr->m_IgnorePreviousClassification !=
6979  m_IgnorePreviousClassCachedValue &&
6980  m_IgnorePreviousClassCheckboxPntr != NULL)
6981  {
6982    m_IgnorePreviousClassCachedValue =
6983      MyAppPntr->m_IgnorePreviousClassification;
6984    m_IgnorePreviousClassCheckboxPntr->SetValue (
6985      m_IgnorePreviousClassCachedValue ? B_CONTROL_ON : B_CONTROL_OFF);
6986  }
6987
6988  /* Update the genuine count. */
6989
6990  if (MyAppPntr->m_TotalGenuineMessages != m_GenuineCountCachedValue &&
6991  m_GenuineCountTextboxPntr != NULL)
6992  {
6993    m_GenuineCountCachedValue = MyAppPntr->m_TotalGenuineMessages;
6994    sprintf (TempString, "%" B_PRIu32, m_GenuineCountCachedValue);
6995    m_GenuineCountTextboxPntr->SetText (TempString);
6996  }
6997
6998  /* Update the spam count. */
6999
7000  if (MyAppPntr->m_TotalSpamMessages != m_SpamCountCachedValue &&
7001  m_SpamCountTextboxPntr != NULL)
7002  {
7003    m_SpamCountCachedValue = MyAppPntr->m_TotalSpamMessages;
7004    sprintf (TempString, "%" B_PRIu32, m_SpamCountCachedValue);
7005    m_SpamCountTextboxPntr->SetText (TempString);
7006  }
7007
7008  /* Update the word count. */
7009
7010  if (MyAppPntr->m_WordCount != m_WordCountCachedValue &&
7011  m_WordCountTextboxPntr != NULL)
7012  {
7013    m_WordCountCachedValue = MyAppPntr->m_WordCount;
7014    sprintf (TempString, "%" B_PRIu32, m_WordCountCachedValue);
7015    m_WordCountTextboxPntr->SetText (TempString);
7016  }
7017
7018  /* Update the tokenize mode pop-up menu. */
7019
7020  if (MyAppPntr->m_TokenizeMode != m_TokenizeModeCachedValue &&
7021  m_TokenizeModePopUpMenuPntr != NULL)
7022  {
7023    m_TokenizeModeCachedValue = MyAppPntr->m_TokenizeMode;
7024    TempMenuItemPntr =
7025      m_TokenizeModePopUpMenuPntr->ItemAt ((int) m_TokenizeModeCachedValue);
7026    if (TempMenuItemPntr != NULL)
7027      TempMenuItemPntr->SetMarked (true);
7028  }
7029
7030  /* Update the scoring mode pop-up menu. */
7031
7032  if (MyAppPntr->m_ScoringMode != m_ScoringModeCachedValue &&
7033  m_ScoringModePopUpMenuPntr != NULL)
7034  {
7035    m_ScoringModeCachedValue = MyAppPntr->m_ScoringMode;
7036    TempMenuItemPntr =
7037      m_ScoringModePopUpMenuPntr->ItemAt ((int) m_ScoringModeCachedValue);
7038    if (TempMenuItemPntr != NULL)
7039      TempMenuItemPntr->SetMarked (true);
7040  }
7041
7042  /* Lock the application.  This will stop it from processing any further
7043  messages until we are done.  Or if it is busy, the lock will fail. */
7044
7045  if (MyAppPntr->LockWithTimeout (100000) != B_OK)
7046    return; /* It's probably busy doing something. */
7047
7048  /* See if the database file name has changed. */
7049
7050  if (strcmp (MyAppPntr->m_DatabaseFileName.String (),
7051  m_DatabaseFileNameCachedValue) != 0 &&
7052  m_DatabaseFileNameTextboxPntr != NULL)
7053  {
7054    strcpy (m_DatabaseFileNameCachedValue,
7055      MyAppPntr->m_DatabaseFileName.String ());
7056    m_DatabaseFileNameTextboxPntr->SetText (m_DatabaseFileNameCachedValue);
7057    WindowPntr->SetTitle (m_DatabaseFileNameCachedValue);
7058  }
7059
7060  /* Done.  Let the BApplication continue processing messages. */
7061
7062  MyAppPntr->Unlock ();
7063}
7064
7065
7066void
7067ControlsView::Pulse ()
7068{
7069  if (system_time () > m_TimeOfLastPoll + 200000)
7070  {
7071    PollServerForChanges ();
7072    m_TimeOfLastPoll = system_time ();
7073  }
7074}
7075
7076
7077
7078/******************************************************************************
7079 * Implementation of the DatabaseWindow class, constructor, destructor and the
7080 * rest of the member functions in mostly alphabetical order.
7081 */
7082
7083DatabaseWindow::DatabaseWindow ()
7084: BWindow (BRect (30, 30, 620, 400),
7085    "Haiku spam filter server",
7086    B_DOCUMENT_WINDOW, B_ASYNCHRONOUS_CONTROLS)
7087{
7088  BRect TempRect;
7089
7090  /* Add the controls view. */
7091
7092  m_ControlsViewPntr = new ControlsView (Bounds ());
7093  if (m_ControlsViewPntr == NULL)
7094    goto ErrorExit;
7095  AddChild (m_ControlsViewPntr);
7096
7097  /* Add the word view in the remaining space under the controls view. */
7098
7099
7100  TempRect = Bounds ();
7101  TempRect.top = m_ControlsViewPntr->Frame().bottom + 1;
7102  m_WordsViewPntr = new WordsView (TempRect);
7103  if (m_WordsViewPntr == NULL)
7104    goto ErrorExit;
7105  AddChild (m_WordsViewPntr);
7106
7107 /* Minimize the window if we are starting up in server mode.  This is done
7108	before the window is open so it doesn't flash onto the screen, and possibly
7109	steal a keystroke or two.  The ControlsView will further update the minimize
7110	mode when it detects changes in the server mode. */
7111  Minimize (g_ServerMode);
7112
7113  return;
7114
7115ErrorExit:
7116  DisplayErrorMessage ("Unable to initialise the window contents.");
7117}
7118
7119
7120void
7121DatabaseWindow::MessageReceived (BMessage *MessagePntr)
7122{
7123  if (MessagePntr->what == B_MOUSE_WHEEL_CHANGED)
7124  {
7125    /* Pass the mouse wheel stuff down to the words view, since that's the only
7126    one which does scrolling so we don't need to worry about whether it has
7127    focus or not. */
7128
7129    if (m_WordsViewPntr != NULL)
7130      m_WordsViewPntr->MessageReceived (MessagePntr);
7131  }
7132  else
7133    BWindow::MessageReceived (MessagePntr);
7134}
7135
7136
7137bool
7138DatabaseWindow::QuitRequested ()
7139{
7140  be_app->PostMessage (B_QUIT_REQUESTED);
7141  return true;
7142}
7143
7144
7145
7146/******************************************************************************
7147 * Implementation of the word display view.
7148 */
7149
7150WordsView::WordsView (BRect NewBounds)
7151: BView (NewBounds, "WordsView", B_FOLLOW_ALL_SIDES,
7152    B_WILL_DRAW | B_FULL_UPDATE_ON_RESIZE | B_NAVIGABLE | B_PULSE_NEEDED),
7153  m_ArrowLineDownPntr (NULL),
7154  m_ArrowLineUpPntr (NULL),
7155  m_ArrowPageDownPntr (NULL),
7156  m_ArrowPageUpPntr (NULL),
7157  m_LastTimeAKeyWasPressed (0)
7158{
7159  font_height TempFontHeight;
7160
7161  GetFont (&m_TextFont); /* Modify the default font to be our own. */
7162  m_TextFont.SetSize (ceilf (m_TextFont.Size() * 1.1));
7163  m_TextFont.GetHeight (&TempFontHeight);
7164  SetFont (&m_TextFont);
7165
7166  m_LineHeight = ceilf (TempFontHeight.ascent +
7167    TempFontHeight.descent + TempFontHeight.leading);
7168  m_AscentHeight = ceilf (TempFontHeight.ascent);
7169  m_TextHeight = ceilf (TempFontHeight.ascent +
7170    TempFontHeight.descent);
7171
7172  m_FocusedColour.red = 255;
7173  m_FocusedColour.green = 255;
7174  m_FocusedColour.blue = 255;
7175  m_FocusedColour.alpha = 255;
7176
7177  m_UnfocusedColour.red = 245;
7178  m_UnfocusedColour.green = 245;
7179  m_UnfocusedColour.blue = 255;
7180  m_UnfocusedColour.alpha = 255;
7181
7182  m_BackgroundColour = m_UnfocusedColour;
7183  SetViewColor (m_BackgroundColour);
7184  SetLowColor (m_BackgroundColour);
7185  SetHighColor (0, 0, 0);
7186
7187  strcpy (m_FirstDisplayedWord, "a");
7188}
7189
7190
7191void
7192WordsView::AttachedToWindow ()
7193{
7194  BPolygon        DownLinePolygon (g_DownLinePoints,
7195                    sizeof (g_DownLinePoints) /
7196                    sizeof (g_DownLinePoints[0]));
7197
7198  BPolygon        DownPagePolygon (g_DownPagePoints,
7199                    sizeof (g_DownPagePoints) /
7200                    sizeof (g_DownPagePoints[0]));
7201
7202  BPolygon        UpLinePolygon (g_UpLinePoints,
7203                    sizeof (g_UpLinePoints) /
7204                    sizeof (g_UpLinePoints[0]));
7205
7206  BPolygon        UpPagePolygon (g_UpPagePoints,
7207                    sizeof (g_UpPagePoints) /
7208                    sizeof (g_UpPagePoints[0]));
7209
7210  BPicture        TempOffPicture;
7211  BPicture        TempOnPicture;
7212  BRect           TempRect;
7213
7214  /* Make the buttons and associated polygon images for the forward and
7215  backwards a word or a page of words buttons.  They're the width of the scroll
7216  bar area on the right, but twice as tall as usual, since there is no scroll
7217  bar and that will make it easier to use them.  First the up a line button. */
7218
7219  SetHighColor (0, 0, 0);
7220  BeginPicture (&TempOffPicture);
7221  FillPolygon (&UpLinePolygon);
7222  SetHighColor (180, 180, 180);
7223  StrokePolygon (&UpLinePolygon);
7224  EndPicture ();
7225
7226  SetHighColor (128, 128, 128);
7227  BeginPicture (&TempOnPicture);
7228  FillPolygon (&UpLinePolygon);
7229  EndPicture ();
7230
7231  TempRect = Bounds ();
7232  TempRect.bottom = TempRect.top + 2 * B_H_SCROLL_BAR_HEIGHT;
7233  TempRect.left = TempRect.right - B_V_SCROLL_BAR_WIDTH;
7234  m_ArrowLineUpPntr = new BPictureButton (TempRect, "Up Line",
7235    &TempOffPicture, &TempOnPicture,
7236    new BMessage (MSG_LINE_UP), B_ONE_STATE_BUTTON,
7237    B_FOLLOW_RIGHT | B_FOLLOW_TOP, B_WILL_DRAW | B_NAVIGABLE);
7238  if (m_ArrowLineUpPntr == NULL) goto ErrorExit;
7239  AddChild (m_ArrowLineUpPntr);
7240  m_ArrowLineUpPntr->SetTarget (this);
7241
7242  /* Up a page button. */
7243
7244  SetHighColor (0, 0, 0);
7245  BeginPicture (&TempOffPicture);
7246  FillPolygon (&UpPagePolygon);
7247  SetHighColor (180, 180, 180);
7248  StrokePolygon (&UpPagePolygon);
7249  EndPicture ();
7250
7251  SetHighColor (128, 128, 128);
7252  BeginPicture (&TempOnPicture);
7253  FillPolygon (&UpPagePolygon);
7254  EndPicture ();
7255
7256  TempRect = Bounds ();
7257  TempRect.top += 2 * B_H_SCROLL_BAR_HEIGHT + 1;
7258  TempRect.bottom = TempRect.top + 2 * B_H_SCROLL_BAR_HEIGHT;
7259  TempRect.left = TempRect.right - B_V_SCROLL_BAR_WIDTH;
7260  m_ArrowPageUpPntr = new BPictureButton (TempRect, "Up Page",
7261    &TempOffPicture, &TempOnPicture,
7262    new BMessage (MSG_PAGE_UP), B_ONE_STATE_BUTTON,
7263    B_FOLLOW_RIGHT | B_FOLLOW_TOP, B_WILL_DRAW | B_NAVIGABLE);
7264  if (m_ArrowPageUpPntr == NULL) goto ErrorExit;
7265  AddChild (m_ArrowPageUpPntr);
7266  m_ArrowPageUpPntr->SetTarget (this);
7267
7268  /* Down a page button. */
7269
7270  SetHighColor (0, 0, 0);
7271  BeginPicture (&TempOffPicture);
7272  FillPolygon (&DownPagePolygon);
7273  SetHighColor (180, 180, 180);
7274  StrokePolygon (&DownPagePolygon);
7275  EndPicture ();
7276
7277  SetHighColor (128, 128, 128);
7278  BeginPicture (&TempOnPicture);
7279  FillPolygon (&DownPagePolygon);
7280  EndPicture ();
7281
7282  TempRect = Bounds ();
7283  TempRect.bottom -= 3 * B_H_SCROLL_BAR_HEIGHT + 1;
7284  TempRect.top = TempRect.bottom - 2 * B_H_SCROLL_BAR_HEIGHT;
7285  TempRect.left = TempRect.right - B_V_SCROLL_BAR_WIDTH;
7286  m_ArrowPageDownPntr = new BPictureButton (TempRect, "Down Page",
7287    &TempOffPicture, &TempOnPicture,
7288    new BMessage (MSG_PAGE_DOWN), B_ONE_STATE_BUTTON,
7289    B_FOLLOW_RIGHT | B_FOLLOW_BOTTOM, B_WILL_DRAW | B_NAVIGABLE);
7290  if (m_ArrowPageDownPntr == NULL) goto ErrorExit;
7291  AddChild (m_ArrowPageDownPntr);
7292  m_ArrowPageDownPntr->SetTarget (this);
7293
7294  /* Down a line button. */
7295
7296  SetHighColor (0, 0, 0);
7297  BeginPicture (&TempOffPicture);
7298  FillPolygon (&DownLinePolygon);
7299  SetHighColor (180, 180, 180);
7300  StrokePolygon (&DownLinePolygon);
7301  EndPicture ();
7302
7303  SetHighColor (128, 128, 128);
7304  BeginPicture (&TempOnPicture);
7305  FillPolygon (&DownLinePolygon);
7306  EndPicture ();
7307
7308  TempRect = Bounds ();
7309  TempRect.bottom -= B_H_SCROLL_BAR_HEIGHT;
7310  TempRect.top = TempRect.bottom - 2 * B_H_SCROLL_BAR_HEIGHT;
7311  TempRect.left = TempRect.right - B_V_SCROLL_BAR_WIDTH;
7312  m_ArrowLineDownPntr = new BPictureButton (TempRect, "Down Line",
7313    &TempOffPicture, &TempOnPicture,
7314    new BMessage (MSG_LINE_DOWN), B_ONE_STATE_BUTTON,
7315    B_FOLLOW_RIGHT | B_FOLLOW_BOTTOM, B_WILL_DRAW | B_NAVIGABLE);
7316  if (m_ArrowLineDownPntr == NULL) goto ErrorExit;
7317  AddChild (m_ArrowLineDownPntr);
7318  m_ArrowLineDownPntr->SetTarget (this);
7319
7320  return;
7321
7322ErrorExit:
7323  DisplayErrorMessage ("Problems while making view displaying the words.");
7324}
7325
7326
7327/* Draw the words starting with the one at or after m_FirstDisplayedWord.  This
7328requires looking at the database in the BApplication, which may or may not be
7329available (if it isn't, don't draw, a redraw will usually be requested by the
7330Pulse member function when it keeps on noticing that the stuff on the display
7331doesn't match the database). */
7332
7333void
7334WordsView::Draw (BRect UpdateRect)
7335{
7336  float                   AgeDifference;
7337  float                   AgeProportion;
7338  float                   CenterX;
7339  float                   ColumnLeftCenterX;
7340  float                   ColumnMiddleCenterX;
7341  float                   ColumnRightCenterX;
7342  float                   CompensatedRatio;
7343  StatisticsMap::iterator DataIter;
7344  StatisticsMap::iterator EndIter;
7345  rgb_color               FillColour;
7346  float                   GenuineProportion;
7347  uint32                  GenuineSpamSum;
7348  float                   HeightPixels;
7349  float                   HeightProportion;
7350  float                   LeftBounds;
7351  ABSApp                 *MyAppPntr;
7352  uint32                  NewestAge;
7353  uint32                  OldestAge;
7354  float                   OneFifthTotalGenuine;
7355  float                   OneFifthTotalSpam;
7356  double                  RawProbabilityRatio;
7357  float                   RightBounds;
7358  float                   SpamProportion;
7359  StatisticsPointer       StatisticsPntr;
7360  BRect                   TempRect;
7361  char                    TempString [PATH_MAX];
7362  float                   TotalGenuineMessages = 1.0; /* Avoid divide by 0. */
7363  float                   TotalSpamMessages = 1.0;
7364  float                   Width;
7365  float                   Y;
7366
7367  /* Lock the application.  This will stop it from processing any further
7368  messages until we are done.  Or if it is busy, the lock will fail. */
7369
7370  MyAppPntr = dynamic_cast<ABSApp *> (be_app);
7371  if (MyAppPntr == NULL || MyAppPntr->LockWithTimeout (100000) != B_OK)
7372    return; /* It's probably busy doing something. */
7373
7374  /* Set up various loop invariant variables. */
7375
7376  if (MyAppPntr->m_TotalGenuineMessages > 0)
7377    TotalGenuineMessages = MyAppPntr->m_TotalGenuineMessages;
7378  OneFifthTotalGenuine = TotalGenuineMessages / 5;
7379
7380  if (MyAppPntr->m_TotalSpamMessages > 0)
7381    TotalSpamMessages = MyAppPntr->m_TotalSpamMessages;
7382  OneFifthTotalSpam = TotalSpamMessages / 5;
7383
7384  EndIter = MyAppPntr->m_WordMap.end ();
7385
7386  OldestAge = MyAppPntr->m_OldestAge;
7387  NewestAge = /* actually newest age plus one */
7388    MyAppPntr->m_TotalGenuineMessages + MyAppPntr->m_TotalSpamMessages;
7389
7390  if (NewestAge == 0)
7391    goto NormalExit; /* No words to display, or something is badly wrong. */
7392
7393  NewestAge--; /* The newest message has age NewestAge. */
7394  AgeDifference = NewestAge - OldestAge; /* Can be zero if just one message. */
7395
7396  LeftBounds = Bounds().left;
7397  RightBounds = Bounds().right - B_V_SCROLL_BAR_WIDTH;
7398  Width = RightBounds - LeftBounds;
7399  FillColour.alpha = 255;
7400
7401  CenterX = ceilf (LeftBounds + Width * 0.5);
7402  ColumnLeftCenterX = ceilf (LeftBounds + Width * 0.05);
7403  ColumnMiddleCenterX = CenterX;
7404  ColumnRightCenterX = ceilf (LeftBounds + Width * 0.95);
7405
7406  for (DataIter = MyAppPntr->m_WordMap.lower_bound (m_FirstDisplayedWord),
7407  Y = Bounds().top;
7408  DataIter != EndIter && Y < UpdateRect.bottom;
7409  DataIter++, Y += m_LineHeight)
7410  {
7411    if (Y + m_LineHeight < UpdateRect.top)
7412      continue; /* Not in the visible area yet, don't actually draw. */
7413
7414    /* Draw the colour bar behind the word.  It reflects the spamness or
7415    genuineness of that particular word, plus the importance of the word and
7416    the age of the word.
7417
7418    First calculate the compensated spam ratio (described elsewhere).  It is
7419    close to 0.0 for genuine words and close to 1.0 for pure spam.  It is drawn
7420    as a blue bar to the left of center if it is less than 0.5, and a red bar
7421    on the right of center if it is greater than 0.5.  At exactly 0.5 nothing
7422    is drawn; the word is worthless as an indicator.
7423
7424    The height of the bar corresponds to the number of messages the word was
7425    found in.  Make the height proportional to the total of spam and genuine
7426    messages for the word divided by the sum of the most extreme spam and
7427    genuine counts in the database.
7428
7429    The staturation of the colour corresponds to the age of the word, with old
7430    words being almost white rather than solid blue or red. */
7431
7432    StatisticsPntr = &DataIter->second;
7433
7434    SpamProportion = StatisticsPntr->spamCount / TotalSpamMessages;
7435    GenuineProportion = StatisticsPntr->genuineCount / TotalGenuineMessages;
7436    if (SpamProportion + GenuineProportion > 0.0f)
7437      RawProbabilityRatio =
7438      SpamProportion / (SpamProportion + GenuineProportion);
7439    else
7440      RawProbabilityRatio = g_RobinsonX;
7441
7442    /* The compensated ratio leans towards 0.5 (RobinsonX) more for fewer
7443    data points, with a weight of 0.45 (RobinsonS). */
7444
7445    GenuineSpamSum =
7446      StatisticsPntr->spamCount + StatisticsPntr->genuineCount;
7447    CompensatedRatio =
7448      (g_RobinsonS * g_RobinsonX + GenuineSpamSum * RawProbabilityRatio) /
7449      (g_RobinsonS + GenuineSpamSum);
7450
7451    /* Used to use the height based on the most frequent word, but some words,
7452    like "From", show up in all messages which made most other words just
7453    appear as a thin line.  I did a histogram plot of the sizes in my test
7454    database, and figured that you get better coverage of 90% of the messages
7455    if you use 1/5 of the total number as the count which gives you 100%
7456    height.  The other 10% get a full height bar, but most people wouldn't care
7457    that they're super frequently used. */
7458
7459    HeightProportion = 0.5f * (StatisticsPntr->genuineCount /
7460      OneFifthTotalGenuine + StatisticsPntr->spamCount / OneFifthTotalSpam);
7461
7462    if (HeightProportion > 1.0f)
7463      HeightProportion = 1.0f;
7464    HeightPixels = ceilf (HeightProportion * m_TextHeight);
7465
7466    if (AgeDifference <= 0.0f)
7467      AgeProportion = 1.0; /* New is 1.0, old is 0.0 */
7468    else
7469      AgeProportion = (StatisticsPntr->age - OldestAge) / AgeDifference;
7470
7471    TempRect.top = ceilf (Y + m_TextHeight / 2 - HeightPixels / 2);
7472    TempRect.bottom = TempRect.top + HeightPixels;
7473
7474    if (CompensatedRatio < 0.5f)
7475    {
7476      TempRect.left = ceilf (
7477        CenterX - 1.6f * (0.5f - CompensatedRatio) * (CenterX - LeftBounds));
7478      TempRect.right = CenterX;
7479      FillColour.red = 230 - (int) (AgeProportion * 230.0f);
7480      FillColour.green = FillColour.red;
7481      FillColour.blue = 255;
7482    }
7483    else /* Ratio >= 0.5, red spam block. */
7484    {
7485      TempRect.left = CenterX;
7486      TempRect.right = ceilf (
7487        CenterX + 1.6f * (CompensatedRatio - 0.5f) * (RightBounds - CenterX));
7488      FillColour.blue = 230 - (int) (AgeProportion * 230.0f);
7489      FillColour.green = FillColour.blue;
7490      FillColour.red = 255;
7491    }
7492    SetHighColor (FillColour);
7493    SetDrawingMode (B_OP_COPY);
7494    FillRect (TempRect);
7495
7496    /* Print the text centered in columns of various widths.  The number of
7497    genuine messages in the left 10% of the width, the word in the middle 80%,
7498    and the number of spam messages using the word in the right 10%. */
7499
7500    SetHighColor (0, 0, 0);
7501    SetDrawingMode (B_OP_OVER); /* So that antialiased text mixes better. */
7502
7503    sprintf (TempString, "%" B_PRIu32, StatisticsPntr->genuineCount);
7504    Width = m_TextFont.StringWidth (TempString);
7505    MovePenTo (ceilf (ColumnLeftCenterX - Width / 2), Y + m_AscentHeight);
7506    DrawString (TempString);
7507
7508    strcpy (TempString, DataIter->first.c_str ());
7509    Width = m_TextFont.StringWidth (TempString);
7510    MovePenTo (ceilf (ColumnMiddleCenterX - Width / 2), Y + m_AscentHeight);
7511    DrawString (TempString);
7512
7513    sprintf (TempString, "%" B_PRIu32, StatisticsPntr->spamCount);
7514    Width = m_TextFont.StringWidth (TempString);
7515    MovePenTo (ceilf (ColumnRightCenterX - Width / 2), Y + m_AscentHeight);
7516    DrawString (TempString);
7517  }
7518
7519  /* Draw the first word (the one which the user types in to select the first
7520  displayed word) on the right, in the scroll bar margin, rotated 90 degrees to
7521  fit between the page up and page down buttons. */
7522
7523  Width = m_TextFont.StringWidth (m_FirstDisplayedWord);
7524  if (Width > 0)
7525  {
7526    TempRect = Bounds ();
7527    TempRect.top += 4 * B_H_SCROLL_BAR_HEIGHT + 1;
7528    TempRect.bottom -= 5 * B_H_SCROLL_BAR_HEIGHT + 1;
7529
7530    MovePenTo (TempRect.right - m_TextHeight + m_AscentHeight - 1,
7531      ceilf ((TempRect.bottom + TempRect.top) / 2 + Width / 2));
7532    m_TextFont.SetRotation (90);
7533    SetFont (&m_TextFont, B_FONT_ROTATION);
7534    DrawString (m_FirstDisplayedWord);
7535    m_TextFont.SetRotation (0);
7536    SetFont (&m_TextFont, B_FONT_ROTATION);
7537  }
7538
7539NormalExit:
7540
7541  /* Successfully finished drawing.  Update the cached values to match what we
7542  have drawn. */
7543  m_CachedTotalGenuineMessages = MyAppPntr->m_TotalGenuineMessages;
7544  m_CachedTotalSpamMessages = MyAppPntr->m_TotalSpamMessages;
7545  m_CachedWordCount = MyAppPntr->m_WordCount;
7546
7547  /* Done.  Let the BApplication continue processing messages. */
7548  MyAppPntr->Unlock ();
7549}
7550
7551
7552/* When the user presses keys, they select the first word to be displayed in
7553the view (it's the word at or lexicographically after the word typed in).  The
7554keys are appended to the starting word, until the user stops typing for a
7555while, then the next key will be the first letter of a new starting word. */
7556
7557void
7558WordsView::KeyDown (const char *BufferPntr, int32 NumBytes)
7559{
7560  int32          CharLength;
7561  bigtime_t      CurrentTime;
7562  char           TempString [40];
7563
7564  CurrentTime = system_time ();
7565
7566  if (NumBytes < (int32) sizeof (TempString))
7567  {
7568    memcpy (TempString, BufferPntr, NumBytes);
7569    TempString [NumBytes] = 0;
7570    CharLength = strlen (TempString); /* So NUL bytes don't get through. */
7571
7572    /* Check for arrow keys, which move the view up and down. */
7573
7574    if (CharLength == 1 &&
7575    (TempString[0] == B_UP_ARROW ||
7576    TempString[0] == B_DOWN_ARROW ||
7577    TempString[0] == B_PAGE_UP ||
7578    TempString[0] == B_PAGE_DOWN))
7579    {
7580      MoveTextUpOrDown ((TempString[0] == B_UP_ARROW) ? MSG_LINE_UP :
7581        ((TempString[0] == B_DOWN_ARROW) ? MSG_LINE_DOWN :
7582        ((TempString[0] == B_PAGE_UP) ? MSG_PAGE_UP : MSG_PAGE_DOWN)));
7583    }
7584    else if (CharLength > 1 ||
7585    (CharLength == 1 && 32 <= (uint8) TempString[0]))
7586    {
7587      /* Have a non-control character, or some sort of multibyte char.  Add it
7588      to the word and mark things for redisplay starting at the resulting word.
7589      */
7590
7591      if (CurrentTime - m_LastTimeAKeyWasPressed >= 1000000 /* microseconds */)
7592        strcpy (m_FirstDisplayedWord, TempString); /* Starting a new word. */
7593      else if (strlen (m_FirstDisplayedWord) + CharLength <= g_MaxWordLength)
7594        strcat (m_FirstDisplayedWord, TempString); /* Append to existing. */
7595
7596      Invalidate ();
7597    }
7598  }
7599
7600  m_LastTimeAKeyWasPressed = CurrentTime;
7601  BView::KeyDown (BufferPntr, NumBytes);
7602}
7603
7604
7605/* Change the background colour to show that we have the focus.  When we have
7606it, keystrokes will select the word to be displayed at the top of the list. */
7607
7608void
7609WordsView::MakeFocus (bool Focused)
7610{
7611  if (Focused)
7612    m_BackgroundColour = m_FocusedColour;
7613  else
7614    m_BackgroundColour = m_UnfocusedColour;
7615  SetViewColor (m_BackgroundColour);
7616  SetLowColor (m_BackgroundColour);
7617
7618  /* Also need to set the background colour for the scroll buttons, since they
7619  can't be made transparent. */
7620
7621  if (m_ArrowLineDownPntr != NULL)
7622  {
7623    m_ArrowLineDownPntr->SetViewColor (m_BackgroundColour);
7624    m_ArrowLineDownPntr->Invalidate ();
7625  }
7626
7627  if (m_ArrowLineUpPntr != NULL)
7628  {
7629    m_ArrowLineUpPntr->SetViewColor (m_BackgroundColour);
7630    m_ArrowLineUpPntr->Invalidate ();
7631  }
7632
7633  if (m_ArrowPageDownPntr != NULL)
7634  {
7635    m_ArrowPageDownPntr->SetViewColor (m_BackgroundColour);
7636    m_ArrowPageDownPntr->Invalidate ();
7637  }
7638
7639  if (m_ArrowPageUpPntr != NULL)
7640  {
7641    m_ArrowPageUpPntr->SetViewColor (m_BackgroundColour);
7642    m_ArrowPageUpPntr->Invalidate ();
7643  }
7644
7645  Invalidate ();
7646
7647  BView::MakeFocus (Focused);
7648}
7649
7650
7651void
7652WordsView::MessageReceived (BMessage *MessagePntr)
7653{
7654  int32     CountFound;
7655  float     DeltaY; /* Usually -1.0, 0.0 or +1.0. */
7656  type_code TypeFound;
7657
7658  switch (MessagePntr->what)
7659  {
7660    case B_MOUSE_WHEEL_CHANGED:
7661      if (MessagePntr->FindFloat ("be:wheel_delta_y", &DeltaY) != 0) break;
7662      if (DeltaY < 0)
7663        MoveTextUpOrDown (MSG_LINE_UP);
7664      else if (DeltaY > 0)
7665        MoveTextUpOrDown (MSG_LINE_DOWN);
7666      break;
7667
7668    case MSG_LINE_DOWN:
7669    case MSG_LINE_UP:
7670    case MSG_PAGE_DOWN:
7671    case MSG_PAGE_UP:
7672      MoveTextUpOrDown (MessagePntr->what);
7673      break;
7674
7675    case B_SIMPLE_DATA: /* Something has been dropped in our view. */
7676      if (MessagePntr->GetInfo ("refs", &TypeFound, &CountFound) == B_OK &&
7677      CountFound > 0 && TypeFound == B_REF_TYPE)
7678      {
7679        RefsDroppedHere (MessagePntr);
7680        break;
7681      }
7682      /* Else fall through to the default case, in case it is something else
7683      dropped that the system knows about. */
7684
7685    default:
7686      BView::MessageReceived (MessagePntr);
7687  }
7688}
7689
7690
7691/* If the user clicks on our view, take over the focus. */
7692
7693void
7694WordsView::MouseDown (BPoint)
7695{
7696  if (!IsFocus ())
7697    MakeFocus (true);
7698}
7699
7700
7701void
7702WordsView::MoveTextUpOrDown (uint32 MovementType)
7703{
7704  StatisticsMap::iterator  DataIter;
7705  int                      i;
7706  ABSApp                  *MyAppPntr;
7707  int                      PageSize;
7708
7709  /* Lock the application.  This will stop it from processing any further
7710  messages until we are done (we need to look at the word list directly).  Or
7711  if it is busy, the lock will fail. */
7712
7713  MyAppPntr = dynamic_cast<ABSApp *> (be_app);
7714  if (MyAppPntr == NULL || MyAppPntr->LockWithTimeout (2000000) != B_OK)
7715    return; /* It's probably busy doing something. */
7716
7717  PageSize = (int) (Bounds().Height() / m_LineHeight - 1);
7718  if (PageSize < 1)
7719    PageSize = 1;
7720
7721  DataIter = MyAppPntr->m_WordMap.lower_bound (m_FirstDisplayedWord);
7722
7723  switch (MovementType)
7724  {
7725    case MSG_LINE_UP:
7726      if (DataIter != MyAppPntr->m_WordMap.begin ())
7727        DataIter--;
7728      break;
7729
7730    case MSG_LINE_DOWN:
7731      if (DataIter != MyAppPntr->m_WordMap.end ())
7732        DataIter++;
7733      break;
7734
7735    case MSG_PAGE_UP:
7736      for (i = 0; i < PageSize; i++)
7737      {
7738        if (DataIter == MyAppPntr->m_WordMap.begin ())
7739          break;
7740        DataIter--;
7741      }
7742      break;
7743
7744    case MSG_PAGE_DOWN:
7745      for (i = 0; i < PageSize; i++)
7746      {
7747        if (DataIter == MyAppPntr->m_WordMap.end ())
7748          break;
7749        DataIter++;
7750      }
7751      break;
7752  }
7753
7754  if (DataIter != MyAppPntr->m_WordMap.end ())
7755    strcpy (m_FirstDisplayedWord, DataIter->first.c_str ());
7756
7757  Invalidate ();
7758
7759  MyAppPntr->Unlock ();
7760}
7761
7762
7763/* This function periodically polls the BApplication to see if anything has
7764changed.  If the word list is different or the display has changed in some
7765other way, it will then try to refresh the display, repeating the attempt until
7766it gets successfully drawn. */
7767
7768void
7769WordsView::Pulse ()
7770{
7771  ABSApp *MyAppPntr;
7772
7773  /* Probe the BApplication to see if it has changed. */
7774
7775  MyAppPntr = dynamic_cast<ABSApp *> (be_app);
7776  if (MyAppPntr == NULL)
7777    return; /* Something is wrong, give up. */
7778
7779  if (MyAppPntr->m_TotalGenuineMessages != m_CachedTotalGenuineMessages ||
7780  MyAppPntr->m_TotalSpamMessages != m_CachedTotalSpamMessages ||
7781  MyAppPntr->m_WordCount != m_CachedWordCount)
7782    Invalidate ();
7783}
7784
7785
7786/* The user has dragged and dropped some file references on the words view.  If
7787it is in the left third, add the file(s) as examples of genuine messages, right
7788third for spam messages and if it is in the middle third then evaluate the
7789file(s) for spaminess. */
7790
7791void
7792WordsView::RefsDroppedHere (BMessage *MessagePntr)
7793{
7794  float  Left;
7795  bool   SpamExample = true; /* TRUE if example is of spam, FALSE genuine. */
7796  float  Third;
7797  BPoint WhereDropped;
7798
7799  /* Find out which third of the view it was dropped into. */
7800
7801  if (MessagePntr->FindPoint ("_drop_point_", &WhereDropped) != B_OK)
7802    return;  /* Need to know where it was dropped. */
7803  ConvertFromScreen (&WhereDropped);
7804  Third = Bounds().Width() / 3;
7805  Left = Bounds().left;
7806  if (WhereDropped.x < Left + Third)
7807    SpamExample = false;
7808  else if (WhereDropped.x < Left + 2 * Third)
7809  {
7810    /* In the middle third, evaluate all files for spaminess. */
7811    EstimateRefFilesAndDisplay (MessagePntr);
7812    return;
7813  }
7814
7815  if (g_CommanderLooperPntr != NULL)
7816    g_CommanderLooperPntr->CommandReferences (
7817    MessagePntr, true /* BulkMode */, SpamExample ? CL_SPAM : CL_GENUINE);
7818}
7819
7820
7821
7822/******************************************************************************
7823 * Finally, the main program which drives it all.
7824 */
7825
7826int main (int argc, char**)
7827{
7828  g_CommandLineMode = (argc > 1);
7829  if (!g_CommandLineMode)
7830    cout << PrintUsage; /* In case no arguments specified. */
7831
7832  g_CommanderLooperPntr = new CommanderLooper;
7833  if (g_CommanderLooperPntr != NULL)
7834  {
7835    g_CommanderMessenger = new BMessenger (NULL, g_CommanderLooperPntr);
7836    g_CommanderLooperPntr->Run ();
7837  }
7838
7839  ABSApp MyApp;
7840
7841  if (MyApp.InitCheck () == 0)
7842  {
7843    MyApp.LoadSaveSettings (true /* DoLoad */);
7844    MyApp.Run ();
7845  }
7846
7847  if (g_CommanderLooperPntr != NULL)
7848  {
7849    g_CommanderLooperPntr->PostMessage (B_QUIT_REQUESTED);
7850    snooze (100000); /* Let the CommanderLooper thread run so it quits. */
7851  }
7852
7853  cerr << "SpamDBM shutting down..." << endl;
7854  return 0; /* And implicitly destroys MyApp, which writes out the database. */
7855}
7856