# SPDX-License-Identifier: MIT
# Copyright (C) 2022 Max Bachmann
from __future__ import annotations

from math import ceil

from rapidfuzz._common_py import conv_sequences
from rapidfuzz._utils import ScorerFlag, add_scorer_attrs, is_none, setupPandas
from rapidfuzz.distance import ScoreAlignment
from rapidfuzz.distance.Indel_py import (
    _block_normalized_similarity as indel_block_normalized_similarity,
    distance as indel_distance,
    normalized_similarity as indel_normalized_similarity,
)


def get_scorer_flags_fuzz(**_kwargs):
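    # scorer metadata picked up through add_scorer_attrs below: results are
    # floats between 0 (worst) and 100 (optimal) and the scorer is symmetric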
    return {
        "optimal_score": 100,
        "worst_score": 0,
        "flags": ScorerFlag.RESULT_F64 | ScorerFlag.SYMMETRIC,
    }


fuzz_attribute = {"get_scorer_flags": get_scorer_flags_fuzz}


def _norm_distance(dist, lensum, score_cutoff):
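    # convert an absolute Indel distance into a similarity in percent:
    # 100 * (1 - dist / lensum), with scores below score_cutoff collapsing to 0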
    score = (100 - 100 * dist / lensum) if lensum else 100
    return score if score >= score_cutoff else 0


def _split_sequence(seq):
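    # str/bytes use the builtin whitespace split; any other sequence of hashables
    # is split manually on whitespace code points and returned as a list of
    # tuples, which stay hashable for the set operations used by the callers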
    if isinstance(seq, (str, bytes)):
        return seq.split()

    splitted_seq = [[]]
    for x in seq:
        ch = x if isinstance(x, str) else chr(x)
        if ch.isspace():
            splitted_seq.append([])
        else:
            splitted_seq[-1].append(x)

    return [tuple(x) for x in splitted_seq if x]


def _join_splitted_sequence(seq_list):
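    # inverse of _split_sequence: re-join the tokens with a single space
    # (" ", b" " or ord(" ") depending on the token type)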
    if not seq_list:
        return ""
    if isinstance(next(iter(seq_list)), str):
        return " ".join(seq_list)
    if isinstance(next(iter(seq_list)), bytes):
        return b" ".join(seq_list)

    joined = []
    for seq in seq_list:
        joined += seq
        joined += [ord(" ")]
    return joined[:-1]


def ratio(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the normalized Indel similarity.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 100.
        For ratio < score_cutoff 0 is returned instead. Default is 0,
        which deactivates this behaviour.

    Returns
    -------
    similarity : float
        similarity between s1 and s2 as a float between 0 and 100

    See Also
    --------
    rapidfuzz.distance.Indel.normalized_similarity : Normalized Indel similarity

    Notes
    -----
    .. image:: img/ratio.svg

    Examples
    --------
    >>> fuzz.ratio("this is a test", "this is a test!")
    96.55171966552734
    """
    setupPandas()
    if is_none(s1) or is_none(s2):
        return 0
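
    # the public API uses scores from 0 to 100, while the underlying Indel
    # scorer expects score_cutoff in the range 0.0 - 1.0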
    if score_cutoff is not None:
        score_cutoff /= 100

    score = indel_normalized_similarity(s1, s2, processor=processor, score_cutoff=score_cutoff)
    return score * 100


def _partial_ratio_impl(s1, s2, score_cutoff):
    """
    implementation of partial_ratio. This assumes len(s1) <= len(s2).
    """
    s1_char_set = set(s1)
    len1 = len(s1)
    len2 = len(s2)

    res = ScoreAlignment(0, 0, len1, 0, len1)
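
    # precompute the bit-parallel pattern of s1 (one bitmask per character),
    # so it can be reused for every window of s2 that is scored below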
    block = {}
    block_get = block.get
    x = 1
    for ch1 in s1:
        block[ch1] = block_get(ch1, 0) | x
        x <<= 1
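
    # score alignments that only partially overlap the start of s2
    # (prefixes s2[:i] with i < len1)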
    for i in range(1, len1):
        substr_last = s2[i - 1]
        if substr_last not in s1_char_set:
            continue

        # todo cache map
        ls_ratio = indel_block_normalized_similarity(block, s1, s2[:i], score_cutoff=score_cutoff)
        if ls_ratio > res.score:
            res.score = score_cutoff = ls_ratio
            res.dest_start = 0
            res.dest_end = i
            if res.score == 1:
                res.score = 100
                return res
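
    # score every full-length window s2[i:i + len1]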
    for i in range(len2 - len1):
        substr_last = s2[i + len1 - 1]
        if substr_last not in s1_char_set:
            continue

        # todo cache map
        ls_ratio = indel_block_normalized_similarity(block, s1, s2[i : i + len1], score_cutoff=score_cutoff)
        if ls_ratio > res.score:
            res.score = score_cutoff = ls_ratio
            res.dest_start = i
            res.dest_end = i + len1
            if res.score == 1:
                res.score = 100
                return res
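
    # score alignments that run past the end of s2 (suffixes s2[i:])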
    for i in range(len2 - len1, len2):
        substr_first = s2[i]
        if substr_first not in s1_char_set:
            continue

        # todo cache map
        ls_ratio = indel_block_normalized_similarity(block, s1, s2[i:], score_cutoff=score_cutoff)
        if ls_ratio > res.score:
            res.score = score_cutoff = ls_ratio
            res.dest_start = i
            res.dest_end = len2
            if res.score == 1:
                res.score = 100
                return res

    res.score *= 100
    return res


def partial_ratio(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Searches for the optimal alignment of the shorter string in the
    longer string and returns the fuzz.ratio for this alignment.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 100.
        For ratio < score_cutoff 0 is returned instead. Default is 0,
        which deactivates this behaviour.

    Returns
    -------
    similarity : float
        similarity between s1 and s2 as a float between 0 and 100

    Notes
    -----
    Depending on the length of the needle (shorter string) different
    implementations are used to improve the performance.

    short needle (length ≤ 64):
        When using a short needle length the fuzz.ratio is calculated for all
        alignments that could result in an optimal alignment. It is
        guaranteed to find the optimal alignment. For short needles this is very
        fast, since for them fuzz.ratio runs in ``O(N)`` time. This results in a worst
        case performance of ``O(NM)``.

    .. image:: img/partial_ratio_short_needle.svg

    long needle (length > 64):
        For long needles a similar implementation to FuzzyWuzzy is used.
        This implementation only considers alignments which start at one
        of the longest common substrings. This results in a worst case performance
        of ``O(N[N/64]M)``. However usually most of the alignments can be skipped.
        The following Python code shows the concept:

        .. code-block:: python

            blocks = SequenceMatcher(None, needle, longer, False).get_matching_blocks()
            score = 0
            for block in blocks:
                long_start = block[1] - block[0] if (block[1] - block[0]) > 0 else 0
                long_end = long_start + len(shorter)
                long_substr = longer[long_start:long_end]
                score = max(score, fuzz.ratio(needle, long_substr))

        This is a lot faster than checking all possible alignments. However it
        only finds one of the best alignments and not necessarily the optimal one.

    .. image:: img/partial_ratio_long_needle.svg

    Examples
    --------
    >>> fuzz.partial_ratio("this is a test", "this is a test!")
    100.0
    """
    alignment = partial_ratio_alignment(s1, s2, processor=processor, score_cutoff=score_cutoff)
    if alignment is None:
        return 0

    return alignment.score


def partial_ratio_alignment(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Searches for the optimal alignment of the shorter string in the
    longer string and returns the fuzz.ratio and the corresponding
    alignment.

    Parameters
    ----------
    s1 : str | bytes
        First string to compare.
    s2 : str | bytes
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 100.
        For ratio < score_cutoff None is returned instead. Default is 0,
        which deactivates this behaviour.

    Returns
    -------
    alignment : ScoreAlignment, optional
        alignment between s1 and s2 with the score as a float between 0 and 100

    Examples
    --------
    >>> s1 = "a certain string"
    >>> s2 = "cetain"
    >>> res = fuzz.partial_ratio_alignment(s1, s2)
    >>> res
    ScoreAlignment(score=83.33333333333334, src_start=2, src_end=8, dest_start=0, dest_end=6)

    Using the alignment information it is possible to calculate the same fuzz.ratio

    >>> fuzz.ratio(s1[res.src_start:res.src_end], s2[res.dest_start:res.dest_end])
    83.33333333333334
    """
    setupPandas()
    if is_none(s1) or is_none(s2):
        return None

    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    if score_cutoff is None:
        score_cutoff = 0

    if not s1 and not s2:
        return ScoreAlignment(100.0, 0, 0, 0, 0)
    s1, s2 = conv_sequences(s1, s2)
    len1 = len(s1)
    len2 = len(s2)
    if len1 <= len2:
        shorter = s1
        longer = s2
    else:
        shorter = s2
        longer = s1

    res = _partial_ratio_impl(shorter, longer, score_cutoff / 100)
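    # for strings of equal length the window search is not symmetric, so the
    # comparison is repeated with the roles swapped and the better alignment kept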
    if res.score != 100 and len1 == len2:
        score_cutoff = max(score_cutoff, res.score)
        res2 = _partial_ratio_impl(longer, shorter, score_cutoff / 100)
        if res2.score > res.score:
            res = ScoreAlignment(res2.score, res2.dest_start, res2.dest_end, res2.src_start, res2.src_end)

    if res.score < score_cutoff:
        return None

    if len1 <= len2:
        return res

    return ScoreAlignment(res.score, res.dest_start, res.dest_end, res.src_start, res.src_end)


def token_sort_ratio(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Sorts the words in the strings and calculates the fuzz.ratio between them

    Parameters
    ----------
    s1 : str
        First string to compare.
    s2 : str
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 100.
        For ratio < score_cutoff 0 is returned instead. Default is 0,
        which deactivates this behaviour.

    Returns
    -------
    similarity : float
        similarity between s1 and s2 as a float between 0 and 100

    Notes
    -----
    .. image:: img/token_sort_ratio.svg

    Examples
    --------
    >>> fuzz.token_sort_ratio("fuzzy wuzzy was a bear", "wuzzy fuzzy was a bear")
    100.0
    """
    setupPandas()
    if is_none(s1) or is_none(s2):
        return 0

    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    s1, s2 = conv_sequences(s1, s2)
    sorted_s1 = _join_splitted_sequence(sorted(_split_sequence(s1)))
    sorted_s2 = _join_splitted_sequence(sorted(_split_sequence(s2)))
    return ratio(sorted_s1, sorted_s2, score_cutoff=score_cutoff)


def token_set_ratio(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Compares the words in the strings based on unique and common words between them
    using fuzz.ratio

    Parameters
    ----------
    s1 : str
        First string to compare.
    s2 : str
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 100.
        For ratio < score_cutoff 0 is returned instead. Default is 0,
        which deactivates this behaviour.

    Returns
    -------
    similarity : float
        similarity between s1 and s2 as a float between 0 and 100

    Notes
    -----
    .. image:: img/token_set_ratio.svg

    Examples
    --------
    >>> fuzz.token_sort_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear")
    83.8709716796875
    >>> fuzz.token_set_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear")
    100.0

    # Returns 100.0 if one string is a subset of the other, regardless of extra content in the longer string
    >>> fuzz.token_set_ratio("fuzzy was a bear but not a dog", "fuzzy was a bear")
    100.0

    # Score is reduced only when there is explicit disagreement in the two strings
    >>> fuzz.token_set_ratio("fuzzy was a bear but not a dog", "fuzzy was a bear but not a cat")
    92.3076923076923
    """
    setupPandas()
    if is_none(s1) or is_none(s2):
        return 0

    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    if score_cutoff is None:
        score_cutoff = 0

    s1, s2 = conv_sequences(s1, s2)

    tokens_a = set(_split_sequence(s1))
    tokens_b = set(_split_sequence(s2))

    # in FuzzyWuzzy this returns 0. For sake of compatibility return 0 here as well
    # see https://github.com/rapidfuzz/RapidFuzz/issues/110
    if not tokens_a or not tokens_b:
        return 0

    intersect = tokens_a.intersection(tokens_b)
    diff_ab = tokens_a.difference(tokens_b)
    diff_ba = tokens_b.difference(tokens_a)

    # one sentence is part of the other one
    if intersect and (not diff_ab or not diff_ba):
        return 100

    diff_ab_joined = _join_splitted_sequence(sorted(diff_ab))
    diff_ba_joined = _join_splitted_sequence(sorted(diff_ba))

    ab_len = len(diff_ab_joined)
    ba_len = len(diff_ba_joined)
    # todo is length sum without joining faster?
    sect_len = len(_join_splitted_sequence(intersect))

    # string length sect+ab <-> sect and sect+ba <-> sect
    sect_ab_len = sect_len + (sect_len != 0) + ab_len
    sect_ba_len = sect_len + (sect_len != 0) + ba_len
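    # the (sect_len != 0) term accounts for the whitespace that joins the
    # intersection with the differing words when the intersection is not empty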

    result = 0.0
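    # largest Indel distance between the two joined differences that can still
    # reach score_cutoff for this combined length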
    cutoff_distance = ceil((sect_ab_len + sect_ba_len) * (1 - score_cutoff / 100))
    dist = indel_distance(diff_ab_joined, diff_ba_joined, score_cutoff=cutoff_distance)

    if dist <= cutoff_distance:
        result = _norm_distance(dist, sect_ab_len + sect_ba_len, score_cutoff)

    # exit early since the other ratios are 0
    if not sect_len:
        return result

    # levenshtein distance sect+ab <-> sect and sect+ba <-> sect
    # since only sect is similar in them the distance can be calculated based on
    # the length difference
    sect_ab_dist = (sect_len != 0) + ab_len
    sect_ab_ratio = _norm_distance(sect_ab_dist, sect_len + sect_ab_len, score_cutoff)

    sect_ba_dist = (sect_len != 0) + ba_len
    sect_ba_ratio = _norm_distance(sect_ba_dist, sect_len + sect_ba_len, score_cutoff)

    return max(result, sect_ab_ratio, sect_ba_ratio)


def token_ratio(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Helper method that returns the maximum of fuzz.token_set_ratio and fuzz.token_sort_ratio
    (faster than manually executing the two functions)

    Parameters
    ----------
    s1 : str
        First string to compare.
    s2 : str
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 100.
        For ratio < score_cutoff 0 is returned instead. Default is 0,
        which deactivates this behaviour.

    Returns
    -------
    similarity : float
        similarity between s1 and s2 as a float between 0 and 100

    Notes
    -----
    .. image:: img/token_ratio.svg
    """
    setupPandas()
    if is_none(s1) or is_none(s2):
        return 0

    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    # todo write combined implementation
    return max(
        token_set_ratio(s1, s2, processor=None, score_cutoff=score_cutoff),
        token_sort_ratio(s1, s2, processor=None, score_cutoff=score_cutoff),
    )


def partial_token_sort_ratio(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    sorts the words in the strings and calculates the fuzz.partial_ratio between them

    Parameters
    ----------
    s1 : str
        First string to compare.
    s2 : str
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 100.
        For ratio < score_cutoff 0 is returned instead. Default is 0,
        which deactivates this behaviour.

    Returns
    -------
    similarity : float
        similarity between s1 and s2 as a float between 0 and 100

    Notes
    -----
    .. image:: img/partial_token_sort_ratio.svg
    """
    setupPandas()
    if is_none(s1) or is_none(s2):
        return 0

    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    s1, s2 = conv_sequences(s1, s2)
    sorted_s1 = _join_splitted_sequence(sorted(_split_sequence(s1)))
    sorted_s2 = _join_splitted_sequence(sorted(_split_sequence(s2)))
    return partial_ratio(sorted_s1, sorted_s2, score_cutoff=score_cutoff)


def partial_token_set_ratio(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Compares the words in the strings based on unique and common words between them
    using fuzz.partial_ratio

    Parameters
    ----------
    s1 : str
        First string to compare.
    s2 : str
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 100.
        For ratio < score_cutoff 0 is returned instead. Default is 0,
        which deactivates this behaviour.

    Returns
    -------
    similarity : float
        similarity between s1 and s2 as a float between 0 and 100

    Notes
    -----
    .. image:: img/partial_token_set_ratio.svg
    """
    setupPandas()
    if is_none(s1) or is_none(s2):
        return 0

    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    s1, s2 = conv_sequences(s1, s2)

    tokens_a = set(_split_sequence(s1))
    tokens_b = set(_split_sequence(s2))
    # in FuzzyWuzzy this returns 0. For sake of compatibility return 0 here as well
    # see https://github.com/rapidfuzz/RapidFuzz/issues/110
    if not tokens_a or not tokens_b:
        return 0

    # exit early when there is a common word in both sequences
    if tokens_a.intersection(tokens_b):
        return 100

    diff_ab = _join_splitted_sequence(sorted(tokens_a.difference(tokens_b)))
    diff_ba = _join_splitted_sequence(sorted(tokens_b.difference(tokens_a)))
    return partial_ratio(diff_ab, diff_ba, score_cutoff=score_cutoff)


def partial_token_ratio(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Helper method that returns the maximum of fuzz.partial_token_set_ratio and
    fuzz.partial_token_sort_ratio (faster than manually executing the two functions)

    Parameters
    ----------
    s1 : str
        First string to compare.
    s2 : str
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 100.
        For ratio < score_cutoff 0 is returned instead. Default is 0,
        which deactivates this behaviour.

    Returns
    -------
    similarity : float
        similarity between s1 and s2 as a float between 0 and 100

    Notes
    -----
    .. image:: img/partial_token_ratio.svg
    """
    setupPandas()
    if is_none(s1) or is_none(s2):
        return 0

    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    if score_cutoff is None:
        score_cutoff = 0

    s1, s2 = conv_sequences(s1, s2)

    tokens_split_a = _split_sequence(s1)
    tokens_split_b = _split_sequence(s2)
    tokens_a = set(tokens_split_a)
    tokens_b = set(tokens_split_b)

    # exit early when there is a common word in both sequences
    if tokens_a.intersection(tokens_b):
        return 100

    diff_ab = tokens_a.difference(tokens_b)
    diff_ba = tokens_b.difference(tokens_a)
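
    # partial_token_sort_ratio part: compare the fully sorted token lists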
    result = partial_ratio(
        _join_splitted_sequence(sorted(tokens_split_a)),
        _join_splitted_sequence(sorted(tokens_split_b)),
        score_cutoff=score_cutoff,
    )

    # do not calculate the same partial_ratio twice
    if len(tokens_split_a) == len(diff_ab) and len(tokens_split_b) == len(diff_ba):
        return result
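
    # partial_token_set_ratio part: with no common words left, only the
    # differing words need to be compared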
    score_cutoff = max(score_cutoff, result)
    return max(
        result,
        partial_ratio(
            _join_splitted_sequence(sorted(diff_ab)),
            _join_splitted_sequence(sorted(diff_ba)),
            score_cutoff=score_cutoff,
        ),
    )


def WRatio(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates a weighted ratio based on the other ratio algorithms

    Parameters
    ----------
    s1 : str
        First string to compare.
    s2 : str
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 100.
        For ratio < score_cutoff 0 is returned instead. Default is 0,
        which deactivates this behaviour.

    Returns
    -------
    similarity : float
        similarity between s1 and s2 as a float between 0 and 100

    Notes
    -----
    .. image:: img/WRatio.svg
    """
    setupPandas()
    if is_none(s1) or is_none(s2):
        return 0
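
    # weights follow FuzzyWuzzy's WRatio: token based scores are scaled by 0.95
    # and partial based scores by 0.9 (or 0.6 for very unequal lengths)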
    UNBASE_SCALE = 0.95

    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    # in FuzzyWuzzy this returns 0. For sake of compatibility return 0 here as well
    # see https://github.com/rapidfuzz/RapidFuzz/issues/110
    if not s1 or not s2:
        return 0

    if score_cutoff is None:
        score_cutoff = 0

    len1 = len(s1)
    len2 = len(s2)
    len_ratio = len1 / len2 if len1 > len2 else len2 / len1
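
    # strings of comparable length only use the token based ratios; once one
    # string is much longer the partial variants are taken into account as well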
    end_ratio = ratio(s1, s2, score_cutoff=score_cutoff)
    if len_ratio < 1.5:
        score_cutoff = max(score_cutoff, end_ratio) / UNBASE_SCALE
        return max(
            end_ratio,
            token_ratio(s1, s2, score_cutoff=score_cutoff, processor=None) * UNBASE_SCALE,
        )

    PARTIAL_SCALE = 0.9 if len_ratio <= 8.0 else 0.6
    score_cutoff = max(score_cutoff, end_ratio) / PARTIAL_SCALE
    end_ratio = max(end_ratio, partial_ratio(s1, s2, score_cutoff=score_cutoff) * PARTIAL_SCALE)

    score_cutoff = max(score_cutoff, end_ratio) / UNBASE_SCALE
    return max(
        end_ratio,
        partial_token_ratio(s1, s2, score_cutoff=score_cutoff, processor=None) * UNBASE_SCALE * PARTIAL_SCALE,
    )


def QRatio(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates a quick ratio between two strings using fuzz.ratio.

    Since v3.0 this behaves similarly to fuzz.ratio with the exception that this
    returns 0 when comparing two empty strings

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 100.
        For ratio < score_cutoff 0 is returned instead. Default is 0,
        which deactivates this behaviour.

    Returns
    -------
    similarity : float
        similarity between s1 and s2 as a float between 0 and 100

    Examples
    --------
    >>> fuzz.QRatio("this is a test", "this is a test!")
    96.55171966552734
    """
    setupPandas()
    if is_none(s1) or is_none(s2):
        return 0

    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    # in FuzzyWuzzy this returns 0. For sake of compatibility return 0 here as well
    # see https://github.com/rapidfuzz/RapidFuzz/issues/110
    if not s1 or not s2:
        return 0

    return ratio(s1, s2, score_cutoff=score_cutoff)


add_scorer_attrs(ratio, fuzz_attribute)
add_scorer_attrs(partial_ratio, fuzz_attribute)
add_scorer_attrs(token_sort_ratio, fuzz_attribute)
add_scorer_attrs(token_set_ratio, fuzz_attribute)
add_scorer_attrs(token_ratio, fuzz_attribute)
add_scorer_attrs(partial_token_sort_ratio, fuzz_attribute)
add_scorer_attrs(partial_token_set_ratio, fuzz_attribute)
add_scorer_attrs(partial_token_ratio, fuzz_attribute)
add_scorer_attrs(WRatio, fuzz_attribute)
add_scorer_attrs(QRatio, fuzz_attribute)