Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Fix searches with standalone punctuation #701

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions solr/conf/schema.xml
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,24 @@
catenateAll="1"
/>
<filter class="solr.ICUFoldingFilterFactory" />
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords_punctuation.txt" />
</analyzer>
</fieldType>

<!-- single token with punctuation terms removed so dismax doesn't look for punctuation terms in these fields -->
<!-- On client side, Lucene query parser breaks things up by whitespace *before* field analysis for dismax -->
<!-- so punctuation terms (e.g. & : ; and the others listed in stopwords_punctuation.txt) are stopwords, to allow results from other fields when these chars are surrounded by spaces in the query -->
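<!-- do not add a separate lowercase filter: ICUFoldingFilterFactory already case-folds, so matching on this type is case-insensitive -->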
<fieldType name="string_punct_stop"
           class="solr.TextField"
           omitNorms="true">
  <!-- Index side: the whole field value is kept as one token and ICU-folded. -->
  <analyzer type="index">
    <tokenizer class="solr.KeywordTokenizerFactory"/>
    <filter class="solr.ICUFoldingFilterFactory"/>
  </analyzer>
  <!-- Query side: same tokenizer and folding as the index side, followed by a stop filter. -->
  <analyzer type="query">
    <tokenizer class="solr.KeywordTokenizerFactory"/>
    <filter class="solr.ICUFoldingFilterFactory"/>
    <!-- Drop query tokens that are listed in stopwords_punctuation.txt, so a
         standalone punctuation term is not searched for in fields of this type. -->
    <filter class="solr.StopFilterFactory"
            words="stopwords_punctuation.txt"
            ignoreCase="true"/>
  </analyzer>
</fieldType>

Expand Down Expand Up @@ -292,6 +310,8 @@
<field name="text" type="text" indexed="true" stored="true" multiValued="true"/>
<field name="unitid_identifier_match" type="identifier_match" indexed="true" stored="false" multiValued="true" />
<field name="ead_identifier_match" type="identifier_match" indexed="true" stored="false" multiValued="true" />
<field name="ref_identifier_match" type="identifier_match" indexed="true" stored="false" multiValued="true" />
<field name="id_search" type="string_punct_stop" indexed="true" stored="false" multiValued="false" />

<field name="_root_" type="string" indexed="true" stored="true" docValues="false" />
<field name="_nest_parent_" type="string" indexed="true" stored="true"/>
Expand Down Expand Up @@ -386,9 +406,12 @@
<copyField source="separatedmaterial_tesim" dest="text" />
<copyField source="userestrict_tesim" dest="text" />
<!-- grab structured data that's important -->
<copyField source="id" dest="id_search" />
<copyField source="unitid_ssm" dest="text" />
<copyField source="unitid_ssm" dest="unitid_identifier_match" />
<copyField source="ead_ssi" dest="ead_identifier_match" />
<copyField source="ref_ssm" dest="ref_identifier_match" />


<!-- sort fields -->
<copyField source="normalized_title_ssm" dest="title_sort"/> <!-- TODO: assumes single values -->
Expand Down
24 changes: 8 additions & 16 deletions solr/conf/solrconfig.xml
Original file line number Diff line number Diff line change
Expand Up @@ -107,10 +107,8 @@
name_teim^10
place_teim^10
subject_teim^2
id
ead_ssi
ref_ssm
unitid_ssm
id_search
ref_identifier_match
container_teim
parent_unittitles_tesim
text
Expand All @@ -124,10 +122,8 @@
name_teim^20
place_teim^20
subject_teim^5
id^2
ead_ssi^2
ref_ssm^2
unitid_ssm^2
id_search^2
ref_identifier_match^2
container_teim^2
parent_unittitles_tesim^2
text^2
Expand All @@ -140,19 +136,15 @@
container_teim^2
</str>
<str name="qf_identifier">
id
ead_ssi
id_search
ead_identifier_match
ref_ssm
unitid_ssm
ref_identifier_match
unitid_identifier_match
</str>
<str name="pf_identifier">
id^2
ead_ssi^2
id_search^2
ead_identifier_match^2
ref_ssm^2
unitid_ssm^2
ref_identifier_match^2
unitid_identifier_match^2
</str>
<str name="qf_name">
Expand Down
23 changes: 23 additions & 0 deletions solr/conf/stopwords_punctuation.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Punctuation characters we want to ignore as terms (i.e., when surrounded
# by whitespace in a query, like 'fred : the puppy') in queries
# ONLY FOR SINGLE TOKEN ANALYZED FIELDS
# see https://issues.apache.org/jira/browse/SOLR-3085
# Note that plusses and double hyphens are not treated as terms
# per debugQuery
:
;
&
/
=
>
<
,
.
(
)
»
§
·
-