Skip to content

Add a new highlighter based on Lucene's match highlighter #92068

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions docs/changelog/92068.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 92068
summary: Add a new highlighter based on lucene's match highlighter
area: Highlighting
type: feature
issues: []
Original file line number Diff line number Diff line change
@@ -64,6 +64,7 @@
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;

import static java.util.Collections.singletonList;
import static java.util.Collections.singletonMap;
@@ -106,7 +107,7 @@

public class HighlighterSearchIT extends ESIntegTestCase {
// TODO as we move analyzers out of the core we need to move some of these into HighlighterWithAnalyzersTests
private static final String[] ALL_TYPES = new String[] { "plain", "fvh", "unified" };
private static final String[] ALL_TYPES = new String[] { "plain", "fvh", "unified", "matches" };

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

At some point we should rewrite at least some of this as an abstract highlighter test case which each highlighter implementation can then test itself against.

@Override
protected Collection<Class<? extends Plugin>> nodePlugins() {
@@ -3508,6 +3509,9 @@ public void testWithNestedQuery() throws Exception {
// but we highlight the root text field since nested documents cannot be highlighted with postings nor term vectors
// directly.
for (String type : ALL_TYPES) {
if (Objects.equals("matches", type)) {
continue; // matches highlighter doesn't support nested fields
}
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

... yet

SearchResponse searchResponse = client().prepareSearch()
.setQuery(nestedQuery("foo", prefixQuery("foo.text", "bro"), ScoreMode.None))
.highlighter(new HighlightBuilder().field(new Field("text").highlighterType(type).requireFieldMatch(false)))
Original file line number Diff line number Diff line change
@@ -219,6 +219,7 @@
import org.elasticsearch.search.fetch.subphase.highlight.FastVectorHighlighter;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightPhase;
import org.elasticsearch.search.fetch.subphase.highlight.Highlighter;
import org.elasticsearch.search.fetch.subphase.highlight.MatchesHighlighter;
import org.elasticsearch.search.fetch.subphase.highlight.PlainHighlighter;
import org.elasticsearch.search.fetch.subphase.highlight.UnifiedHighlighter;
import org.elasticsearch.search.internal.ShardSearchRequest;
@@ -863,6 +864,7 @@ private static Map<String, Highlighter> setupHighlighters(Settings settings, Lis
highlighters.register("fvh", new FastVectorHighlighter(settings));
highlighters.register("plain", new PlainHighlighter());
highlighters.register("unified", new UnifiedHighlighter());
highlighters.register("matches", new MatchesHighlighter());
highlighters.extractAndRegister(plugins, SearchPlugin::getHighlighters);

return unmodifiableMap(highlighters.getRegistry());
Original file line number Diff line number Diff line change
@@ -54,6 +54,10 @@ public HighlightField(String name, Text[] fragments) {
this.fragments = fragments;
}

/**
 * Convenience constructor that converts each plain-string fragment into a {@link Text}
 * before delegating to the main constructor.
 */
public HighlightField(String name, List<String> fragments) {
    this(name, fragments.stream().map(fragment -> new Text(fragment)).toArray(size -> new Text[size]));
}

/**
* The name of the field highlighted.
*/
Original file line number Diff line number Diff line change
@@ -40,7 +40,6 @@ public FetchSubPhaseProcessor getProcessor(FetchContext context) {
if (context.highlight() == null) {
return null;
}

return getProcessor(context, context.highlight(), context.parsedQuery().query());
}

@@ -161,10 +160,7 @@ private FieldContext contextBuilders(
)
);
}
// TODO in future we can load the storedFields in advance here and make use of them,
// but for now they are loaded separately in HighlightUtils so we only return whether
// or not we need source.
storedFieldsSpec = storedFieldsSpec.merge(new StoredFieldsSpec(sourceRequired, false, Set.of()));
storedFieldsSpec = storedFieldsSpec.merge(new StoredFieldsSpec(sourceRequired, false, storedFields));
}
return new FieldContext(storedFieldsSpec, builders);
}
Original file line number Diff line number Diff line change
@@ -10,19 +10,14 @@
import org.apache.lucene.search.highlight.DefaultEncoder;
import org.apache.lucene.search.highlight.Encoder;
import org.apache.lucene.search.highlight.SimpleHTMLEncoder;
import org.elasticsearch.index.fieldvisitor.CustomFieldsVisitor;
import org.elasticsearch.index.mapper.MappedFieldType;
import org.elasticsearch.index.mapper.ValueFetcher;
import org.elasticsearch.index.query.SearchExecutionContext;
import org.elasticsearch.search.fetch.FetchSubPhase;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Objects;

import static java.util.Collections.singleton;

public final class HighlightUtils {

@@ -43,11 +38,8 @@ public static List<Object> loadFieldValues(
FetchSubPhase.HitContext hitContext,
boolean forceSource
) throws IOException {
if (forceSource == false && fieldType.isStored()) {
CustomFieldsVisitor fieldVisitor = new CustomFieldsVisitor(singleton(fieldType.name()), false);
hitContext.reader().document(hitContext.docId(), fieldVisitor);
List<Object> textsToHighlight = fieldVisitor.fields().get(fieldType.name());
return Objects.requireNonNullElse(textsToHighlight, Collections.emptyList());
if (forceSource == false && hitContext.loadedFields().containsKey(fieldType.name())) {
return hitContext.loadedFields().get(fieldType.name());
}
ValueFetcher fetcher = fieldType.valueFetcher(searchContext, null);
fetcher.setNextReader(hitContext.readerContext());
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the Server Side Public License, v 1; you may not use this file except
* in compliance with, at your election, the Elastic License 2.0 or the Server
* Side Public License, v 1.
*/

package org.elasticsearch.search.fetch.subphase.highlight;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.search.FilterMatchesIterator;
import org.apache.lucene.search.Matches;
import org.apache.lucene.search.MatchesIterator;
import org.apache.lucene.search.MatchesUtils;
import org.apache.lucene.search.matchhighlight.OffsetRange;
import org.apache.lucene.search.matchhighlight.OffsetsFromTokens;
import org.apache.lucene.search.matchhighlight.OffsetsRetrievalStrategy;
import org.apache.lucene.search.matchhighlight.Passage;
import org.apache.lucene.search.matchhighlight.PassageFormatter;
import org.elasticsearch.common.lucene.Lucene;
import org.elasticsearch.index.mapper.TextSearchInfo;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;

/**
 * Highlights individual fields using components from lucene's match highlighter.
 *
 * One instance is built per field per hit; the shared {@link MatchesHighlighterState}
 * (see {@link MatchesHighlighter}) supplies the {@link Matches} for the current
 * query/document pair.
 */
class MatchesFieldHighlighter {

    private final FieldHighlightContext context;
    // Matches for the current query/document pair; null when the query did not match this doc
    private final Matches matches;
    private final Analyzer analyzer;
    private final String field;

    MatchesFieldHighlighter(FieldHighlightContext context, MatchesHighlighterState state) throws IOException {
        this.context = context;
        // TODO term vectors and require_field_match=false should intercept things here
        this.matches = state.getMatches(context.query, context.hitContext.readerContext(), context.hitContext.docId());
        this.analyzer = context.context.getSearchExecutionContext().getIndexAnalyzer(s -> Lucene.STANDARD_ANALYZER);
        this.field = context.fieldType.name();
    }

    /**
     * @return a MatchesIterator for this field, based on the field highlighter configuration,
     *         or {@code null} if the query produced no matches for this document
     */
    MatchesIterator getMatchesIterator() throws IOException {
        if (this.matches == null) {
            return null;
        }

        // matched_fields lets users highlight this field using matches from other fields;
        // when unset we read matches from the field itself
        Set<String> matchFields = context.field.fieldOptions().matchedFields();
        if (matchFields == null || matchFields.isEmpty()) {
            matchFields = Set.of(field);
        }

        List<MatchesIterator> fieldIterators = new ArrayList<>();
        // loop variable deliberately named matchField so it doesn't shadow this.field
        for (String matchField : matchFields) {
            MatchesIterator it = this.matches.getMatches(matchField);
            if (it != null) {
                fieldIterators.add(it);
            }
        }
        return MatchesUtils.disjunction(fieldIterators);
    }

    /**
     * Uses a MatchesIterator to highlight a list of source inputs.
     *
     * @param it           iterator over the match positions/offsets for this field
     * @param sourceValues the field's values, one entry per stored/fetched value
     * @return formatted highlight fragments
     */
    public List<String> buildHighlights(MatchesIterator it, List<CharSequence> sourceValues) throws IOException {
        // All values are concatenated into one text; sourceRanges records where each
        // value starts and ends so the formatter never merges passages across values.
        String contiguousSourceText = buildContiguousSourceText(sourceValues);
        OffsetsRetrievalStrategy offsetsStrategy = getOffsetStrategy();
        List<OffsetRange> matchRanges = offsetsStrategy.get(it, f -> sourceValues);
        List<OffsetRange> sourceRanges = computeValueRanges(sourceValues);
        XPassageSelector passageSelector = new XPassageSelector(); // TODO word break stuff goes here
        PassageFormatter formatter = new PassageFormatter(
            "...",
            context.field.fieldOptions().preTags()[0],
            context.field.fieldOptions().postTags()[0]
        ); // TODO multiple field markers a la FVH
        List<Passage> passages = passageSelector.pickBest(
            contiguousSourceText,
            matchRanges,
            context.field.fieldOptions().fragmentCharSize(),
            context.field.fieldOptions().numberOfFragments(),
            sourceRanges
        );
        return formatter.format(contiguousSourceText, passages, sourceRanges);
    }

    /**
     * Chooses how match offsets are retrieved, preferring whatever the index already
     * stores (offsets, then positions) and falling back to re-analyzing the source text.
     */
    private OffsetsRetrievalStrategy getOffsetStrategy() {
        TextSearchInfo tsi = context.fieldType.getTextSearchInfo();
        // TODO termvectors
        return switch (tsi.luceneFieldType().indexOptions()) {
            case DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS -> new XOffsetsFromMatchIterator(
                field,
                new XOffsetsFromPositions(field, analyzer)
            );
            case DOCS_AND_FREQS_AND_POSITIONS -> limitOffsets(new XOffsetsFromPositions(field, analyzer));
            case DOCS_AND_FREQS, DOCS -> new OffsetsFromTokens(field, analyzer);
            // This should be unreachable because we won't get a MatchesIterator from an unindexed field
            case NONE -> (matchesIterator, doc) -> { throw new IllegalStateException("Field [" + field + "] is not indexed"); };
        };
    }

    // TODO might be more sensible to push this back into OffsetsFromPositions
    /**
     * Wraps an offsets strategy so that matches beyond the (approximate) position
     * corresponding to max_analyzed_offset are ignored.
     */
    private OffsetsRetrievalStrategy limitOffsets(OffsetsRetrievalStrategy in) {
        if (context.field.fieldOptions().maxAnalyzedOffset() == null) {
            return in;
        }
        return (matchesIterator, doc) -> {
            // max_analyzed_offset is a character limit; /5 approximates a position
            // limit assuming an average token length of five characters
            int positionCutOff = context.field.fieldOptions().maxAnalyzedOffset() / 5;
            MatchesIterator wrapped = new FilterMatchesIterator(matchesIterator) {
                @Override
                public boolean next() throws IOException {
                    if (matchesIterator.next() == false) {
                        return false;
                    }
                    // positions arrive in order, so the first match past the cut-off ends iteration
                    return matchesIterator.startPosition() <= positionCutOff;
                }
            };
            return in.get(wrapped, doc);
        };
    }

    /**
     * Joins multiple field values into a single text, padding the joins with spaces
     * so that analyzer offset gaps are preserved.
     */
    private String buildContiguousSourceText(List<CharSequence> values) {
        String value;
        if (values.size() == 1) {
            value = values.get(0).toString();
        } else {
            // TODO: This can be inefficient if offset gap is large but the logic
            // of applying offsets would get much more complicated so leaving for now
            // (would have to recalculate all offsets to omit gaps).
            String fieldGapPadding = " ".repeat(analyzer.getOffsetGap(field));
            value = String.join(fieldGapPadding, values);
        }
        return value;
    }

    /**
     * Computes the offset range each value occupies inside the contiguous source text
     * built by {@link #buildContiguousSourceText}; must use the same gap arithmetic.
     */
    private List<OffsetRange> computeValueRanges(List<CharSequence> values) {
        ArrayList<OffsetRange> valueRanges = new ArrayList<>();
        int offset = 0;
        for (CharSequence v : values) {
            valueRanges.add(new OffsetRange(offset, offset + v.length()));
            offset += v.length();
            offset += analyzer.getOffsetGap(field);
        }
        return valueRanges;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the Server Side Public License, v 1; you may not use this file except
* in compliance with, at your election, the Elastic License 2.0 or the Server
* Side Public License, v 1.
*/

package org.elasticsearch.search.fetch.subphase.highlight;

import org.apache.lucene.search.MatchesIterator;
import org.elasticsearch.index.mapper.MappedFieldType;

import java.io.IOException;
import java.util.List;

/**
 * A highlighter that uses the output of a query's Matches to highlight tokens
 */
public class MatchesHighlighter implements Highlighter {

    private static final String MATCHES_HIGHLIGHTER_CONFIG_KEY = "matches_highlighter_config_key";

    @Override
    public boolean canHighlight(MappedFieldType fieldType) {
        // every field type is supported
        return true;
    }

    @Override
    public HighlightField highlight(FieldHighlightContext fieldContext) throws IOException {
        // State is shared across all fields of the hit via the fetch cache, and is
        // created lazily the first time any field is highlighted.
        MatchesHighlighterState state = (MatchesHighlighterState) fieldContext.cache.computeIfAbsent(
            MATCHES_HIGHLIGHTER_CONFIG_KEY,
            key -> new MatchesHighlighterState(fieldContext.context.searcher().getIndexReader())
        );

        MatchesFieldHighlighter fieldHighlighter = new MatchesFieldHighlighter(fieldContext, state);
        MatchesIterator matchesIterator = fieldHighlighter.getMatchesIterator();
        if (matchesIterator == null) {
            // the query produced no matches for this field, nothing to highlight
            return null;
        }

        // Load the field's values and view them as CharSequences for the highlighter
        List<CharSequence> sourceValues = HighlightUtils.loadFieldValues(
            fieldContext.fieldType,
            fieldContext.context.getSearchExecutionContext(),
            fieldContext.hitContext,
            fieldContext.forceSource
        ).stream().<CharSequence>map(Object::toString).toList();

        List<String> fragments = fieldHighlighter.buildHighlights(matchesIterator, sourceValues);
        return new HighlightField(fieldContext.fieldName, fragments);
    }
}
Loading