/// <summary>
/// Un-inverts <c>Field</c> for the given reader: enumerates the field's terms
/// (optionally restricted to those starting with <paramref name="termPrefix"/>)
/// and records, for every document, the delta-encoded (vInt) list of term numbers
/// that occur in it. The per-document lists are first built in an intermediate
/// form and then packed into the shared <c>Tnums</c> arrays, with <c>Index</c>
/// holding either the inlined term numbers or a pointer into those arrays.
/// </summary>
/// <remarks>
/// If the reader exposes no fields, the field has no terms, or no term exists at or
/// after the seek start, the method returns early; in that case <c>Index</c>,
/// <c>NumTermsInField</c>, <c>Total_time</c> and <c>Phase1_time</c> are not assigned.
/// </remarks>
/// <param name="reader">Reader whose terms for <c>Field</c> are un-inverted.</param>
/// <param name="liveDocs">Passed to <c>TermsEnum.Docs</c> when enumerating the documents of each term.</param>
/// <param name="termPrefix">If non-null, only terms starting with this prefix are visited; a deep copy is stored in <c>Prefix</c>.</param>
/// <exception cref="InvalidOperationException">
/// The field was indexed with doc values, or a packed term-number array outgrew
/// the 24 bits available for addressing it.
/// </exception>
protected internal virtual void Uninvert(AtomicReader reader, Bits liveDocs, BytesRef termPrefix)
{
    FieldInfo info = reader.FieldInfos.FieldInfo(Field);
    if (info != null && info.HasDocValues())
    {
        throw new InvalidOperationException("Type mismatch: " + Field + " was indexed as " + info.DocValuesType);
    }

    // Measure elapsed time with a monotonic stopwatch. (DateTime.Now.Millisecond is only the
    // 0-999 millisecond component of the wall clock and cannot be used to compute durations.)
    System.Diagnostics.Stopwatch timer = System.Diagnostics.Stopwatch.StartNew();

    Prefix = termPrefix == null ? null : BytesRef.DeepCopyOf(termPrefix);

    int maxDoc = reader.MaxDoc;
    int[] index = new int[maxDoc]; // immediate term numbers, or the index into the byte[] representing the last number
    int[] lastTerm = new int[maxDoc]; // last term we saw for this document
    sbyte[][] bytes = new sbyte[maxDoc][]; // list of term numbers for the doc (delta encoded vInts)

    Fields fields = reader.Fields;
    if (fields == null)
    {
        // No terms
        return;
    }
    Terms terms = fields.Terms(Field);
    if (terms == null)
    {
        // No terms
        return;
    }

    TermsEnum te = terms.Iterator(null);
    BytesRef seekStart = termPrefix != null ? termPrefix : new BytesRef();
    if (te.SeekCeil(seekStart) == TermsEnum.SeekStatus.END)
    {
        // No terms match
        return;
    }

    // If we need our "term index wrapper", these will be
    // init'd below:
    IList<BytesRef> indexedTerms = null;
    PagedBytes indexedTermsBytes = null;

    bool testedOrd = false;

    // we need a minimum of 9 bytes, but round up to 12 since the space would
    // be wasted with most allocators anyway.
    sbyte[] tempArr = new sbyte[12];

    //
    // enumerate all terms, and build an intermediate form of the un-inverted field.
    //
    // During this intermediate form, every document has a (potential) byte[]
    // and the int[maxDoc()] array either contains the termNumber list directly
    // or the *end* offset of the termNumber list in its byte array (for faster
    // appending and faster creation of the final form).
    //
    // idea... if things are too large while building, we could do a range of docs
    // at a time (but it would be a fair amount slower to build)
    // could also do ranges in parallel to take advantage of multiple CPUs

    // OPTIONAL: remap the largest df terms to the lowest 128 (single byte)
    // values. this requires going over the field first to find the most
    // frequent terms ahead of time.

    int termNum = 0;
    DocsEnum = null;

    // Loop begins with te positioned to first term (we call
    // seek above):
    for (; ; )
    {
        BytesRef t = te.Term();
        if (t == null || (termPrefix != null && !StringHelper.StartsWith(t, termPrefix)))
        {
            break;
        }

        if (!testedOrd)
        {
            try
            {
                OrdBase = (int)te.Ord();
            }
            catch (System.NotSupportedException)
            {
                // Reader cannot provide ord support, so we wrap
                // our own support by creating our own terms index:
                indexedTerms = new List<BytesRef>();
                indexedTermsBytes = new PagedBytes(15);
            }
            testedOrd = true;
        }

        VisitTerm(te, termNum);

        if (indexedTerms != null && (termNum & IndexIntervalMask) == 0)
        {
            // Index this term
            SizeOfIndexedStrings += t.Length;
            BytesRef indexedTerm = new BytesRef();
            indexedTermsBytes.Copy(t, indexedTerm);
            // TODO: really should 1) strip off useless suffix,
            // and 2) use FST not array/PagedBytes
            indexedTerms.Add(indexedTerm);
        }

        int df = te.DocFreq();
        if (df <= MaxTermDocFreq)
        {
            DocsEnum = te.Docs(liveDocs, DocsEnum, DocsEnum.FLAG_NONE);

            // dF, but takes deletions into account
            int actualDF = 0;

            for (; ; )
            {
                int doc = DocsEnum.NextDoc();
                if (doc == DocIdSetIterator.NO_MORE_DOCS)
                {
                    break;
                }
                actualDF++;
                TermInstances++;

                // add TNUM_OFFSET to the term number to make room for special reserved values:
                // 0 (end term) and 1 (index into byte array follows)
                int delta = termNum - lastTerm[doc] + TNUM_OFFSET;
                lastTerm[doc] = termNum;
                int val = index[doc];

                if ((val & 0xff) == 1)
                {
                    // index into byte array (actually the end of
                    // the doc-specific byte[] when building)
                    int pos = (int)((uint)val >> 8);
                    int ilen = VIntSize(delta);
                    sbyte[] arr = bytes[doc];
                    int newend = pos + ilen;
                    if (newend > arr.Length)
                    {
                        // We avoid a doubling strategy to lower memory usage.
                        // this faceting method isn't for docs with many terms.
                        // In hotspot, objects have 2 words of overhead, then fields, rounded up to a 64-bit boundary.
                        // TODO: figure out what array lengths we can round up to w/o actually using more memory
                        // (how much space does a byte[] take up?  Is data preceded by a 32 bit length only?
                        // It should be safe to round up to the nearest 32 bits in any case.
                        int newLen = (newend + 3) & unchecked((int)0xfffffffc); // 4 byte alignment
                        sbyte[] newarr = new sbyte[newLen];
                        Array.Copy(arr, 0, newarr, 0, pos);
                        arr = newarr;
                        bytes[doc] = newarr;
                    }
                    pos = WriteInt(delta, arr, pos);
                    index[doc] = (pos << 8) | 1; // update pointer to end index in byte[]
                }
                else
                {
                    // OK, this int has data in it... find the end (a zero starting byte - not
                    // part of another number, hence not following a byte with the high bit set).
                    int ipos;
                    if (val == 0)
                    {
                        ipos = 0;
                    }
                    else if ((val & 0x0000ff80) == 0)
                    {
                        ipos = 1;
                    }
                    else if ((val & 0x00ff8000) == 0)
                    {
                        ipos = 2;
                    }
                    else if ((val & 0xff800000) == 0)
                    {
                        ipos = 3;
                    }
                    else
                    {
                        ipos = 4;
                    }

                    int endPos = WriteInt(delta, tempArr, ipos);
                    if (endPos <= 4)
                    {
                        // value will fit in the integer... move bytes back
                        for (int j = ipos; j < endPos; j++)
                        {
                            val |= (tempArr[j] & 0xff) << (j << 3);
                        }
                        index[doc] = val;
                    }
                    else
                    {
                        // value won't fit... move integer into byte[]
                        for (int j = 0; j < ipos; j++)
                        {
                            // Deliberate truncation to the low 8 bits; unchecked keeps this
                            // safe even when the assembly is compiled with overflow checking.
                            tempArr[j] = unchecked((sbyte)val);
                            val = (int)((uint)val >> 8);
                        }
                        // point at the end index in the byte[]
                        index[doc] = (endPos << 8) | 1;
                        bytes[doc] = tempArr;
                        // tempArr is now owned by this document; allocate a fresh scratch buffer
                        tempArr = new sbyte[12];
                    }
                }
            }
            SetActualDocFreq(termNum, actualDF);
        }

        termNum++;
        if (te.Next() == null)
        {
            break;
        }
    }

    NumTermsInField = termNum;

    // Elapsed time for the enumeration phase
    long phase1Millis = timer.ElapsedMilliseconds;

    if (TermInstances == 0)
    {
        // we didn't invert anything
        // lower memory consumption.
        Tnums = null;
    }
    else
    {
        this.Index = index;

        //
        // transform intermediate form into the final form, building a single byte[]
        // at a time, and releasing the intermediate byte[]s as we go to avoid
        // increasing the memory footprint.
        //
        for (int pass = 0; pass < 256; pass++)
        {
            sbyte[] target = Tnums[pass];
            int pos = 0; // end in target;
            if (target != null)
            {
                pos = target.Length;
            }
            else
            {
                target = new sbyte[4096];
            }

            // loop over documents, 0x00ppxxxx, 0x01ppxxxx, 0x02ppxxxx
            // where pp is the pass (which array we are building), and xx is all values.
            // each pass shares the same byte[] for termNumber lists.
            for (int docbase = pass << 16; docbase < maxDoc; docbase += (1 << 24))
            {
                int lim = Math.Min(docbase + (1 << 16), maxDoc);
                for (int doc = docbase; doc < lim; doc++)
                {
                    int val = index[doc];
                    if ((val & 0xff) == 1)
                    {
                        int len = (int)((uint)val >> 8);
                        index[doc] = (pos << 8) | 1; // change index to point to start of array
                        if ((pos & 0xff000000) != 0)
                        {
                            // we only have 24 bits for the array index
                            throw new InvalidOperationException("Too many values for UnInvertedField faceting on field " + Field);
                        }
                        sbyte[] arr = bytes[doc];
                        bytes[doc] = null; // IMPORTANT: allow GC to avoid OOM
                        if (target.Length <= pos + len)
                        {
                            int newlen = target.Length;
                            // We don't have to worry about the array getting too large
                            // since the "pos" param will overflow first (only 24 bits available).
                            while (newlen <= pos + len) // doubling strategy
                            {
                                newlen <<= 1;
                            }
                            sbyte[] newtarget = new sbyte[newlen];
                            Array.Copy(target, 0, newtarget, 0, pos);
                            target = newtarget;
                        }
                        Array.Copy(arr, 0, target, pos, len);
                        pos += len + 1; // skip single byte at end and leave it 0 for terminator
                    }
                }
            }

            // shrink array
            if (pos < target.Length)
            {
                sbyte[] newtarget = new sbyte[pos];
                Array.Copy(target, 0, newtarget, 0, pos);
                target = newtarget;
            }

            Tnums[pass] = target;

            if ((pass << 16) > maxDoc)
            {
                break;
            }
        }
    }

    if (indexedTerms != null)
    {
        IndexedTermsArray = indexedTerms.ToArray();
    }

    Total_time = (int)timer.ElapsedMilliseconds;
    Phase1_time = (int)phase1Millis;
}