/// <summary>
/// Reads every file in the hard-coded data directory, splits each file into messages at
/// lines that start with "From " (mbox-style separators), passes each message to
/// <c>parse_msg</c>, and finally flushes the working segment and runs up to 40 merges.
/// </summary>
/// <remarks>
/// Progress (document count, elapsed time, docs/sec, words/sec) is written to the console
/// and the database state is dumped through <c>gui.debugDump</c> after every message.
/// A message is emitted both when the next separator line is seen and when the end of the
/// file is reached, so the last message of each file is indexed too.
/// </remarks>
public void parse_email_messages()
{
    // Upper bound on the number of indexed documents; ingestion stops early once exceeded.
    const long max_docs = 4000000000;

    string basepath = @"c:\EmailTest\Data";
    // http://www.csharp-examples.net/get-files-from-directory/
    string[] filePaths = Directory.GetFiles(basepath);

    // NOTE(review): both counters start at 1 (not 0), so the first document id ends in ":2"
    // and the throughput figures are off by one. Kept as-is so existing doc ids do not shift.
    long doc_count = 1;
    long word_count = 1;

    // Stopwatch is monotonic, unlike DateTime.Now which can jump with clock adjustments.
    System.Diagnostics.Stopwatch timer = System.Diagnostics.Stopwatch.StartNew();
    bool limit_reached = false;

    foreach (var fn in filePaths) {
        // Directory.GetFiles already returns paths that include basepath; Path.Combine
        // returns its second argument unchanged when that argument is rooted.
        string fullpath = Path.Combine(basepath, fn);
        Console.WriteLine(fullpath);

        // 'using' guarantees the file handle is released even if parsing throws.
        using (FileStream r = File.Open(fullpath, FileMode.Open, FileAccess.Read, FileShare.Read))
        using (BufferedStream reader = new BufferedStream(r)) {
            // http://msdn.microsoft.com/en-us/library/system.io.streamreader.readline.aspx
            List<string> lines = new List<string>();
            // One memory-only write group is shared by all messages of this file.
            LayerWriteGroup txwg = new LayerWriteGroup(db, type: LayerWriteGroup.WriteGroupType.MEMORY_ONLY);

            while (true) {
                // Same end-of-data test as before: stop once at most one byte remains.
                bool at_eof = !(reader.Position < reader.Length - 1);
                string line = at_eof ? null : UnixReadLine(reader);
                bool is_separator = !at_eof
                    && line.Length > 6
                    && line.StartsWith("From ", StringComparison.Ordinal);

                if (!at_eof && !is_separator) {
                    lines.Add(line);
                    continue;
                }

                // Message boundary: either a "From " separator line or the end of the file.
                if (lines.Count > 0) {
                    string msg = String.Join("\n", lines);
                    doc_count++;
                    string docid = fullpath + ":" + doc_count;
                    int doc_numwords;
                    parse_msg(txwg, docid, msg, out doc_numwords);
                    word_count += doc_numwords;

                    double elapsed_s = timer.Elapsed.TotalSeconds;
                    Console.WriteLine("doc{0}: {1} elapsed:{2} docs/sec:{3} words/sec:{4}",
                        doc_count, docid, elapsed_s, (float)doc_count / elapsed_s, (float)word_count / elapsed_s);
                    gui.debugDump(db);

                    // end after a certain number of documents...
                    if (doc_count > max_docs) {
                        limit_reached = true;
                        break;
                    }
                }

                if (at_eof) {
                    break;
                }
                // The separator line itself is not part of any message body.
                lines.Clear();
            } // while adding docs
        }

        if (limit_reached) {
            break;
        }
    } // foreach file

    // The banner is skipped when ingestion stopped early because of the document limit.
    if (!limit_reached) {
        Console.WriteLine("=================== EmailInjector end... time to fully optimize...");
    }

    // we have to lock to assure we don't collide with the background merge thread
    lock (db) {
        // be sure to flush and merge before we search...
        db.flushWorkingSegment();
        gui.debugDump(db);
        for (int x = 0; x < 40; x++) {
            var mc = db.rangemapmgr.mergeManager.getBestCandidate();
            // Dumped before the null check, so the "no candidate" state is shown as well.
            gui.debugDump(db, mc);
            if (mc == null) { break; }
            db.performMerge(mc);
            gui.debugDump(db);
        }
    }
}