diff --git a/solrdf/src/main/java/org/gazzax/labs/solrdf/NTriples.java b/solrdf/src/main/java/org/gazzax/labs/solrdf/NTriples.java
index 5698900..f4a9ce6 100644
--- a/solrdf/src/main/java/org/gazzax/labs/solrdf/NTriples.java
+++ b/solrdf/src/main/java/org/gazzax/labs/solrdf/NTriples.java
@@ -7,7 +7,7 @@
import com.hp.hpl.jena.rdf.model.AnonId;
/**
- * Booch utility for parsing RDF resources.
+ * Booch utility for parsing NTriples.
*
* @author Andrea Gazzarini
* @since 1.0
diff --git a/solrdf/src/main/java/org/gazzax/labs/solrdf/Strings.java b/solrdf/src/main/java/org/gazzax/labs/solrdf/Strings.java
index a67e84e..105ebdf 100644
--- a/solrdf/src/main/java/org/gazzax/labs/solrdf/Strings.java
+++ b/solrdf/src/main/java/org/gazzax/labs/solrdf/Strings.java
@@ -27,6 +27,13 @@ public static boolean isNullOrEmpty(final String value) {
return value == null || value.trim().length() == 0;
}
+ /**
+ * Removes unnecessary decimal zeros from a numeric string.
+ * In the extreme case where all decimal digits are 0, the result is an integer string (e.g. 10.000 becomes 10).
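+ * For example, round("7.500") would yield "7.5" and round("10.000") would yield "10".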
+ *
+ * @param numericStringValue the numeric string.
+ * @return a new string with unnecessary decimal zeros removed.
+ */
public static String round(final String numericStringValue) {
final int indexOfDot = numericStringValue.indexOf(".");
if (indexOfDot == -1) {
diff --git a/solrdf/src/main/java/org/gazzax/labs/solrdf/Utility.java b/solrdf/src/main/java/org/gazzax/labs/solrdf/Utility.java
deleted file mode 100644
index babba70..0000000
--- a/solrdf/src/main/java/org/gazzax/labs/solrdf/Utility.java
+++ /dev/null
@@ -1,121 +0,0 @@
-package org.gazzax.labs.solrdf;
-
-import com.google.common.hash.HashCode;
-import com.google.common.hash.HashFunction;
-import com.google.common.hash.Hashing;
-import com.hp.hpl.jena.graph.Node;
-
-/**
- * General-purposes Booch utility.
- *
- * @author Andrea Gazzarini
- * @since 1.0
- */
-public abstract class Utility {
- private static final HashFunction MURMUR_HASH_3 = Hashing.murmur3_128();
-
- /**
- * Returns the local name part of the given URI.
- * Basically here because I didn't understand how Node.getLocalName() works.
- *
- * @param uri the URI.
- * @return the local name part of the given URI.
- */
- public static String localName(final String uri) {
- return uri.substring(startIndexOfLocalName(uri) + 1);
- }
-
- /**
- * Returns the local name part of the given URI.
- * Basically here because I didn't understand how Node.getLocalName() works.
- *
- * @param uri the URI.
- * @return the local name part of the given URI.
- */
- public static String localName(final Node uri) {
- return localName(uri.getURI());
- }
-
- /**
- * Returns the domain part of the given URI.
- * Basically here because I didn't understand how Node.getNameSpace() works.
- *
- * @param uri the URI.
- * @return the domain part of the given URI.
- */
- public static String namespace(final String uri) {
- return uri.substring(0, startIndexOfLocalName(uri) + 1);
- }
-
- /**
- * Returns the domain part of the given URI.
- * Basically here because I didn't understand how Node.getNameSpace() works.
- *
- * @param uri the URI.
- * @return the domain part of the given URI.
- */
- public static String namespace(final Node uri) {
- return namespace(uri.getURI());
- }
-
- /**
- * Returns true if the given byte array represents a variable. The incoming
- * value is considered a variable if
- *
- * <ul>
- * <li>Array is null</li>
- * <li>Array is not null but empty (i.e. size is 0)</li>
- * </ul>
- *
- * @param value the value to be checked.
- * @return true if the given value represents a variable.
- */
- public static boolean isVariable(final byte[] value) {
- return value == null || value.length == 0;
- }
-
- /**
- * Returns true if the given value is a variable. A value is considered a
- * variable if it is null.
- *
- * @param value the value that will be checked.
- * @return true if the given value is a variable.
- */
- public static boolean isVariable(final Node value) {
- return value == null;
- }
-
- /**
- * Hashes the given byte[] with MurmurHash3.
- *
- * @param input the input
- * @return the hash of the input
- */
- public static HashCode murmurHash3(final byte[] input) {
- return MURMUR_HASH_3.hashBytes(input);
- }
-
- /**
- * Returns the start index of the local name part of the given URI.
- *
- * @param uri the URI.
- * @return the start index of the local name part of the given URI.
- */
- static int startIndexOfLocalName(final String uri) {
- int indexOfSeparator = uri.indexOf('#');
-
- if (indexOfSeparator == -1) {
- indexOfSeparator = uri.lastIndexOf('/');
- }
-
- if (indexOfSeparator == -1) {
- indexOfSeparator = uri.lastIndexOf(':');
- }
-
- if (indexOfSeparator == -1) {
- throw new IllegalArgumentException(uri);
- }
-
- return indexOfSeparator;
- }
-}
\ No newline at end of file
diff --git a/solrdf/src/main/java/org/gazzax/labs/solrdf/handler/search/faceting/PerSegmentSingleValuedFaceting.java b/solrdf/src/main/java/org/gazzax/labs/solrdf/handler/search/faceting/PerSegmentSingleValuedFaceting.java
index 52d48d0..87b21db 100644
--- a/solrdf/src/main/java/org/gazzax/labs/solrdf/handler/search/faceting/PerSegmentSingleValuedFaceting.java
+++ b/solrdf/src/main/java/org/gazzax/labs/solrdf/handler/search/faceting/PerSegmentSingleValuedFaceting.java
@@ -1,4 +1,5 @@
package org.gazzax.labs.solrdf.handler.search.faceting;
+
import java.io.IOException;
import java.util.LinkedList;
import java.util.List;
@@ -31,350 +32,376 @@
import org.apache.solr.util.BoundedTreeSet;
/**
- * A class that generates facet information for a given request. Note that it
- * extends the already existing {@link SimpleFacets} in order to reuse that
- * logic as much as possible.
+ * A class that generates facet information for a given request.
*
- * @author Andrea Gazzarini
- * @since 1.0
+ * A similar class exists within {@link SimpleFacets}, but it unfortunately has default visibility and
+ * therefore cannot be used from {@link RDFacets}, which lives in a different package.
*/
-// FIXME this depends on SimpleFacets (several static method calls)
public class PerSegmentSingleValuedFaceting {
-
- // input params
- SolrIndexSearcher searcher;
- DocSet docs;
- String fieldName;
- int offset;
- int limit;
- int mincount;
- boolean missing;
- String sort;
- String prefix;
-
- Filter baseSet;
-
- int nThreads;
-
- public PerSegmentSingleValuedFaceting(SolrIndexSearcher searcher, DocSet docs, String fieldName, int offset, int limit, int mincount, boolean missing, String sort, String prefix) {
- this.searcher = searcher;
- this.docs = docs;
- this.fieldName = fieldName;
- this.offset = offset;
- this.limit = limit;
- this.mincount = mincount;
- this.missing = missing;
- this.sort = sort;
- this.prefix = prefix;
- }
-
- public void setNumThreads(int threads) {
- nThreads = threads;
- }
-
-
- NamedList<Integer> getFacetCounts(Executor executor) throws IOException {
-
- CompletionService<SegFacet> completionService = new ExecutorCompletionService<>(executor);
-
- // reuse the translation logic to go from top level set to per-segment set
- baseSet = docs.getTopFilter();
-
- final List<AtomicReaderContext> leaves = searcher.getTopReaderContext().leaves();
- // The list of pending tasks that aren't immediately submitted
- // TODO: Is there a completion service, or a delegating executor that can
- // limit the number of concurrent tasks submitted to a bigger executor?
- LinkedList<Callable<SegFacet>> pending = new LinkedList<>();
-
- int threads = nThreads <= 0 ? Integer.MAX_VALUE : nThreads;
-
- for (final AtomicReaderContext leave : leaves) {
- final SegFacet segFacet = new SegFacet(leave);
-
- Callable<SegFacet> task = new Callable<SegFacet>() {
- @Override
- public SegFacet call() throws Exception {
- segFacet.countTerms();
- return segFacet;
- }
- };
-
- // TODO: if limiting threads, submit by largest segment first?
-
- if (--threads >= 0) {
- completionService.submit(task);
- } else {
- pending.add(task);
- }
- }
-
-
- // now merge the per-segment results
- PriorityQueue<SegFacet> queue = new PriorityQueue<SegFacet>(leaves.size()) {
- @Override
- protected boolean lessThan(SegFacet a, SegFacet b) {
- return a.tempBR.compareTo(b.tempBR) < 0;
- }
- };
-
-
- boolean hasMissingCount=false;
- int missingCount=0;
- for (int i=0, c=leaves.size(); i<c; i++) {
- SegFacet seg = null;
- try {
- Future<SegFacet> future = completionService.take();
- seg = future.get();
- if (!pending.isEmpty()) {
- completionService.submit(pending.removeFirst());
- }
- } catch (InterruptedException e) {
- Thread.currentThread().interrupt();
- throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
- } catch (ExecutionException e) {
- Throwable cause = e.getCause();
- if (cause instanceof RuntimeException) {
- throw (RuntimeException)cause;
- } else {
- throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Error in per-segment faceting on field: " + fieldName, cause);
- }
- }
-
-
- if (seg.startTermIndex < seg.endTermIndex) {
- if (seg.startTermIndex==-1) {
- hasMissingCount=true;
- missingCount += seg.counts[0];
- seg.pos = 0;
- } else {
- seg.pos = seg.startTermIndex;
- }
- if (seg.pos < seg.endTermIndex) {
- seg.tenum = seg.si.termsEnum();
- seg.tenum.seekExact(seg.pos);
- seg.tempBR = seg.tenum.term();
- queue.add(seg);
- }
- }
- }
-
- FacetCollector collector;
- if (sort.equals(FacetParams.FACET_SORT_COUNT) || sort.equals(FacetParams.FACET_SORT_COUNT_LEGACY)) {
- collector = new CountSortedFacetCollector(offset, limit, mincount);
- } else {
- collector = new IndexSortedFacetCollector(offset, limit, mincount);
- }
-
- BytesRefBuilder val = new BytesRefBuilder();
-
- while (queue.size() > 0) {
- SegFacet seg = queue.top();
-
- // we will normally end up advancing the term enum for this segment
- // while still using "val", so we need to make a copy since the BytesRef
- // may be shared across calls.
- val.copyBytes(seg.tempBR);
-
- int count = 0;
-
- do {
- count += seg.counts[seg.pos - seg.startTermIndex];
-
- // TODO: OPTIMIZATION...
- // if mincount>0 then seg.pos++ can skip ahead to the next non-zero entry.
- seg.pos++;
- if (seg.pos >= seg.endTermIndex) {
- queue.pop();
- seg = queue.top();
- } else {
- seg.tempBR = seg.tenum.next();
- seg = queue.updateTop();
- }
- } while (seg != null && val.get().compareTo(seg.tempBR) == 0);
-
- boolean stop = collector.collect(val.get(), count);
- if (stop) break;
- }
-
- NamedList<Integer> res = collector.getFacetCounts();
-
- // convert labels to readable form
- FieldType ft = searcher.getSchema().getFieldType(fieldName);
- int sz = res.size();
- for (int i=0; i<sz; i++) {
+ while (queue.size() > 0) {
+ SegFacet seg = queue.top();
+
+ // we will normally end up advancing the term enum for this segment
+ // while still using "val", so we need to make a copy since the
+ // BytesRef
+ // may be shared across calls.
+ val.copyBytes(seg.tempBR);
+
+ int count = 0;
+
+ do {
+ count += seg.counts[seg.pos - seg.startTermIndex];
+
+ // TODO: OPTIMIZATION...
+ // if mincount>0 then seg.pos++ can skip ahead to the next
+ // non-zero entry.
+ seg.pos++;
+ if (seg.pos >= seg.endTermIndex) {
+ queue.pop();
+ seg = queue.top();
+ } else {
+ seg.tempBR = seg.tenum.next();
+ seg = queue.updateTop();
+ }
+ } while (seg != null && val.get().compareTo(seg.tempBR) == 0);
+
+ boolean stop = collector.collect(val.get(), count);
+ if (stop)
+ break;
+ }
+
+ NamedList<Integer> res = collector.getFacetCounts();
+
+ // convert labels to readable form
+ FieldType ft = searcher.getSchema().getFieldType(fieldName);
+ int sz = res.size();
+ for (int i = 0; i < sz; i++) {
+ res.setName(i, ft.indexedToReadable(res.getName(i)));
+ }
+
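+ // The "missing" bucket is reported under a null key; when the per-segment counting did not
+ // already accumulate it, fall back to SimpleFacets.getFieldMissingCount.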
+ if (missing) {
+ if (!hasMissingCount) {
+ missingCount = SimpleFacets.getFieldMissingCount(searcher, docs, fieldName);
+ }
+ res.add(null, missingCount);
+ }
+
+ return res;
+ }
+
+ /**
+ * A Segment facet.
+ */
+ class SegFacet {
+ final AtomicReaderContext context;
+
+ /**
+ * Builds a new {@link SegFacet} with the given context.
+ *
+ * @param context the segment context.
+ */
+ SegFacet(final AtomicReaderContext context) {
+ this.context = context;
+ }
+
+ SortedDocValues si;
+ int startTermIndex;
+ int endTermIndex;
+ int[] counts;
+
+ int pos; // only used when merging
+ TermsEnum tenum; // only used when merging
+
+ BytesRef tempBR = new BytesRef();
+
+ void countTerms() throws IOException {
+ si = FieldCache.DEFAULT.getTermsIndex(context.reader(), fieldName);
+
+ if (prefix != null) {
+ final BytesRefBuilder prefixRef = new BytesRefBuilder();
+ prefixRef.copyChars(prefix);
+ startTermIndex = si.lookupTerm(prefixRef.get());
+ if (startTermIndex < 0) {
+ startTermIndex = -startTermIndex - 1;
+ }
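+ // lookupTerm returns (-insertionPoint - 1) when the exact term is absent, so the negation above
+ // yields the ordinal of the first term with the given prefix; appending BIG_TERM (a byte sequence
+ // greater than any real term) and looking that up gives the end of the prefix range below.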
+ prefixRef.append(UnicodeUtil.BIG_TERM);
+ endTermIndex = si.lookupTerm(prefixRef.get());
+ assert endTermIndex < 0;
+ endTermIndex = -endTermIndex - 1;
+ } else {
+ startTermIndex = -1;
+ endTermIndex = si.getValueCount();
+ }
+
+ final int nTerms = endTermIndex - startTermIndex;
+ if (nTerms > 0) {
+ final int[] counts = this.counts = new int[nTerms];
+ final DocIdSet idSet = baseSet.getDocIdSet(context, null);
+ final DocIdSetIterator iterator = idSet.iterator();
+
+ int doc;
+
+ if (prefix == null) {
+ while ((doc = iterator.nextDoc()) < DocIdSetIterator.NO_MORE_DOCS) {
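+ // getOrd returns -1 for documents without a value; since startTermIndex is -1 in this
+ // no-prefix branch, the +1 shift below makes counts[0] accumulate those "missing" documents.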
+ counts[1 + si.getOrd(doc)]++;
+ }
+ } else {
+ while ((doc = iterator.nextDoc()) < DocIdSetIterator.NO_MORE_DOCS) {
+ int term = si.getOrd(doc);
+ int arrIdx = term - startTermIndex;
+ if (arrIdx >= 0 && arrIdx < nTerms)
+ counts[arrIdx]++;
+ }
+ }
+ }
+ }
+ }
}
-
-
abstract class FacetCollector {
- /*** return true to stop collection */
- public abstract boolean collect(BytesRef term, int count);
- public abstract NamedList<Integer> getFacetCounts();
-}
+ /*** return true to stop collection */
+ public abstract boolean collect(BytesRef term, int count);
+ public abstract NamedList<Integer> getFacetCounts();
+}
-// This collector expects facets to be collected in index order
class CountSortedFacetCollector extends FacetCollector {
- private final CharsRefBuilder spare = new CharsRefBuilder();
-
- final int offset;
- final int limit;
- final int maxsize;
- final BoundedTreeSet<SimpleFacets.CountPair<String, Integer>> queue;
-
- int min; // the smallest value in the top 'N' values
-
- public CountSortedFacetCollector(int offset, int limit, int mincount) {
- this.offset = offset;
- this.limit = limit;
- maxsize = limit>0 ? offset+limit : Integer.MAX_VALUE-1;
- queue = new BoundedTreeSet<>(maxsize);
- min=mincount-1; // the smallest value in the top 'N' values
- }
-
- @Override
- public boolean collect(BytesRef term, int count) {
- if (count > min) {
- // NOTE: we use c>min rather than c>=min as an optimization because we are going in
- // index order, so we already know that the keys are ordered. This can be very
- // important if a lot of the counts are repeated (like zero counts would be).
- spare.copyUTF8Bytes(term);
- queue.add(new SimpleFacets.CountPair<>(spare.toString(), count));
- if (queue.size()>=maxsize) min=queue.last().val;
- }
- return false;
- }
-
- @Override
- public NamedList<Integer> getFacetCounts() {
- NamedList<Integer> res = new NamedList<>();
- int off=offset;
- int lim=limit>=0 ? limit : Integer.MAX_VALUE;
- // now select the right page from the results
- for (SimpleFacets.CountPair<String, Integer> p : queue) {
- if (--off>=0) continue;
- if (--lim<0) break;
- res.add(p.key, p.val);
- }
- return res;
- }
+ private final CharsRefBuilder spare = new CharsRefBuilder();
+
+ final int offset;
+ final int limit;
+ final int maxsize;
+ final BoundedTreeSet<SimpleFacets.CountPair<String, Integer>> queue;
+
+ int min; // the smallest value in the top 'N' values
+
+ public CountSortedFacetCollector(int offset, int limit, int mincount) {
+ this.offset = offset;
+ this.limit = limit;
+ maxsize = limit > 0 ? offset + limit : Integer.MAX_VALUE - 1;
+ queue = new BoundedTreeSet<>(maxsize);
+ min = mincount - 1; // the smallest value in the top 'N' values
+ }
+
+ @Override
+ public boolean collect(BytesRef term, int count) {
+ if (count > min) {
+ // NOTE: we use c>min rather than c>=min as an optimization because
+ // we are going in
+ // index order, so we already know that the keys are ordered. This
+ // can be very
+ // important if a lot of the counts are repeated (like zero counts
+ // would be).
+ spare.copyUTF8Bytes(term);
+ queue.add(new SimpleFacets.CountPair<>(spare.toString(), count));
+ if (queue.size() >= maxsize) {
+ min = queue.last().val;
+ }
+ }
+ return false;
+ }
+
+ @Override
+ public NamedList<Integer> getFacetCounts() {
+ NamedList<Integer> res = new NamedList<>();
+ int off = offset;
+ int lim = limit >= 0 ? limit : Integer.MAX_VALUE;
+ // now select the right page from the results
+ for (SimpleFacets.CountPair<String, Integer> p : queue) {
+ if (--off >= 0)
+ continue;
+ if (--lim < 0)
+ break;
+ res.add(p.key, p.val);
+ }
+ return res;
+ }
}
// This collector expects facets to be collected in index order
class IndexSortedFacetCollector extends FacetCollector {
- private final CharsRefBuilder spare = new CharsRefBuilder();
-
- int offset;
- int limit;
- final int mincount;
- final NamedList<Integer> res = new NamedList<>();
-
- public IndexSortedFacetCollector(int offset, int limit, int mincount) {
- this.offset = offset;
- this.limit = limit>0 ? limit : Integer.MAX_VALUE;
- this.mincount = mincount;
- }
-
- @Override
- public boolean collect(BytesRef term, int count) {
- if (count < mincount) {
- return false;
- }
-
- if (offset > 0) {
- offset--;
- return false;
- }
-
- if (limit > 0) {
- spare.copyUTF8Bytes(term);
- res.add(spare.toString(), count);
- limit--;
- }
-
- return limit <= 0;
- }
-
- @Override
- public NamedList<Integer> getFacetCounts() {
- return res;
- }
+ private final CharsRefBuilder spare = new CharsRefBuilder();
+
+ int offset;
+ int limit;
+ final int mincount;
+ final NamedList<Integer> res = new NamedList<>();
+
+ public IndexSortedFacetCollector(int offset, int limit, int mincount) {
+ this.offset = offset;
+ this.limit = limit > 0 ? limit : Integer.MAX_VALUE;
+ this.mincount = mincount;
+ }
+
+ @Override
+ public boolean collect(BytesRef term, int count) {
+ if (count < mincount) {
+ return false;
+ }
+
+ if (offset > 0) {
+ offset--;
+ return false;
+ }
+
+ if (limit > 0) {
+ spare.copyUTF8Bytes(term);
+ res.add(spare.toString(), count);
+ limit--;
+ }
+
+ return limit <= 0;
+ }
+
+ @Override
+ public NamedList<Integer> getFacetCounts() {
+ return res;
+ }
}
diff --git a/solrdf/src/main/java/org/gazzax/labs/solrdf/handler/search/faceting/RDFacets.java b/solrdf/src/main/java/org/gazzax/labs/solrdf/handler/search/faceting/RDFacets.java
index 4e72ef0..2104b2e 100644
--- a/solrdf/src/main/java/org/gazzax/labs/solrdf/handler/search/faceting/RDFacets.java
+++ b/solrdf/src/main/java/org/gazzax/labs/solrdf/handler/search/faceting/RDFacets.java
@@ -1,5 +1,7 @@
package org.gazzax.labs.solrdf.handler.search.faceting;
+import static org.gazzax.labs.solrdf.Strings.isNotNullOrEmptyString;
+
import java.io.IOException;
import java.util.ArrayList;
import java.util.EnumSet;
@@ -55,8 +57,6 @@
import org.gazzax.labs.solrdf.log.MessageFactory;
import org.slf4j.LoggerFactory;
-import static org.gazzax.labs.solrdf.Strings.*;
-
/**
* A class that generates facet information for a given request. Note that it
* extends the already existing {@link SimpleFacets} in order to reuse that
@@ -118,11 +118,11 @@ public NamedList