To: vim_dev@googlegroups.com Subject: Patch 8.2.1665 Fcc: outbox From: Bram Moolenaar Mime-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ------------ Patch 8.2.1665 Problem: Cannot do fuzzy string matching. Solution: Add matchfuzzy(). (Yegappan Lakshmanan, closes #6932) Files: runtime/doc/eval.txt, runtime/doc/usr_41.txt, src/evalfunc.c, src/proto/search.pro, src/search.c, src/testdir/test_functions.vim *** ../vim-8.2.1664/runtime/doc/eval.txt 2020-09-06 21:47:39.323041533 +0200 --- runtime/doc/eval.txt 2020-09-11 22:20:52.847578582 +0200 *************** *** 2627,2632 **** --- 2641,2647 ---- matchdelete({id} [, {win}]) Number delete match identified by {id} matchend({expr}, {pat} [, {start} [, {count}]]) Number position where {pat} ends in {expr} + matchfuzzy({list}, {str}) List fuzzy match {str} in {list} matchlist({expr}, {pat} [, {start} [, {count}]]) List match and submatches of {pat} in {expr} matchstr({expr}, {pat} [, {start} [, {count}]]) *************** *** 7260,7265 **** --- 7310,7338 ---- Can also be used as a |method|: > GetText()->matchend('word') + + matchfuzzy({list}, {str}) *matchfuzzy()* + Returns a list with all the strings in {list} that fuzzy + match {str}. The strings in the returned list are sorted + based on the matching score. {str} is treated as a literal + string and regular expression matching is NOT supported. + The maximum supported {str} length is 256. + + If there are no matching strings or there is an error, then an + empty list is returned. If length of {str} is greater than + 256, then returns an empty list. + + Example: > + :echo matchfuzzy(["clay", "crow"], "cay") + < results in ["clay"]. > + :echo getbufinfo()->map({_, v -> v.name})->matchfuzzy("ndl") + < results in a list of buffer names fuzzy matching "ndl". > + :echo v:oldfiles->matchfuzzy("test") + < results in a list of file names fuzzy matching "test". > + :let l = readfile("buffer.c")->matchfuzzy("str") + < results in a list of lines in "buffer.c" fuzzy matching "str". + + matchlist({expr}, {pat} [, {start} [, {count}]]) *matchlist()* Same as |match()|, but return a |List|. The first item in the list is the matched string, same as what matchstr() would *** ../vim-8.2.1664/runtime/doc/usr_41.txt 2020-09-04 16:35:06.421571299 +0200 --- runtime/doc/usr_41.txt 2020-09-11 22:15:07.264666300 +0200 *************** *** 595,600 **** --- 603,609 ---- charclass() class of a character match() position where a pattern matches in a string matchend() position where a pattern match ends in a string + matchfuzzy() fuzzy matches a string in a list of strings matchstr() match of a pattern in a string matchstrpos() match and positions of a pattern in a string matchlist() like matchstr() and also return submatches *** ../vim-8.2.1664/src/evalfunc.c 2020-09-06 21:47:39.323041533 +0200 --- src/evalfunc.c 2020-09-11 22:15:07.264666300 +0200 *************** *** 750,755 **** --- 750,756 ---- {"matcharg", 1, 1, FEARG_1, ret_list_string, f_matcharg}, {"matchdelete", 1, 2, FEARG_1, ret_number, f_matchdelete}, {"matchend", 2, 4, FEARG_1, ret_number, f_matchend}, + {"matchfuzzy", 2, 2, FEARG_1, ret_list_string, f_matchfuzzy}, {"matchlist", 2, 4, FEARG_1, ret_list_string, f_matchlist}, {"matchstr", 2, 4, FEARG_1, ret_string, f_matchstr}, {"matchstrpos", 2, 4, FEARG_1, ret_list_any, f_matchstrpos}, *** ../vim-8.2.1664/src/proto/search.pro 2020-06-01 17:28:31.511939716 +0200 --- src/proto/search.pro 2020-09-11 22:15:07.264666300 +0200 *************** *** 36,39 **** --- 36,40 ---- spat_T *get_spat(int idx); int get_spat_last_idx(void); void f_searchcount(typval_T *argvars, typval_T *rettv); + void f_matchfuzzy(typval_T *argvars, typval_T *rettv); /* vim: set ft=c : */ *** ../vim-8.2.1664/src/search.c 2020-09-05 23:15:56.409267096 +0200 --- src/search.c 2020-09-11 22:15:07.264666300 +0200 *************** *** 4165,4168 **** --- 4165,4508 ---- the_end: restore_last_search_pattern(); } + + /* + * Fuzzy string matching + * + * Ported from the lib_fts library authored by Forrest Smith. + * https://github.com/forrestthewoods/lib_fts/tree/master/code + * + * Blog describing the algorithm: + * https://www.forrestthewoods.com/blog/reverse_engineering_sublime_texts_fuzzy_match/ + * + * Each matching string is assigned a score. The following factors are checked: + * Matched letter + * Unmatched letter + * Consecutively matched letters + * Proximity to start + * Letter following a separator (space, underscore) + * Uppercase letter following lowercase (aka CamelCase) + * + * Matched letters are good. Unmatched letters are bad. Matching near the start + * is good. Matching the first letter in the middle of a phrase is good. + * Matching the uppercase letters in camel case entries is good. + * + * The score assigned for each factor is explained below. + * File paths are different from file names. File extensions may be ignorable. + * Single words care about consecutive matches but not separators or camel + * case. + * Score starts at 0 + * Matched letter: +0 points + * Unmatched letter: -1 point + * Consecutive match bonus: +5 points + * Separator bonus: +10 points + * Camel case bonus: +10 points + * Unmatched leading letter: -3 points (max: -9) + * + * There is some nuance to this. Scores don’t have an intrinsic meaning. The + * score range isn’t 0 to 100. It’s roughly [-50, 50]. Longer words have a + * lower minimum score due to unmatched letter penalty. Longer search patterns + * have a higher maximum score due to match bonuses. + * + * Separator and camel case bonus is worth a LOT. Consecutive matches are worth + * quite a bit. + * + * There is a penalty if you DON’T match the first three letters. Which + * effectively rewards matching near the start. However there’s no difference + * in matching between the middle and end. + * + * There is not an explicit bonus for an exact match. Unmatched letters receive + * a penalty. So shorter strings and closer matches are worth more. + */ + typedef struct + { + listitem_T *item; + int score; + } fuzzyItem_T; + + static int + fuzzy_match_recursive( + char_u *fuzpat, + char_u *str, + int *outScore, + char_u *strBegin, + char_u *srcMatches, + char_u *matches, + int maxMatches, + int nextMatch, + int *recursionCount, + int recursionLimit) + { + // Recursion params + int recursiveMatch = FALSE; + char_u bestRecursiveMatches[256]; + int bestRecursiveScore = 0; + int first_match; + int matched; + + // Count recursions + ++*recursionCount; + if (*recursionCount >= recursionLimit) + return FALSE; + + // Detect end of strings + if (*fuzpat == '\0' || *str == '\0') + return FALSE; + + // Loop through fuzpat and str looking for a match + first_match = TRUE; + while (*fuzpat != '\0' && *str != '\0') + { + // Found match + if (vim_tolower(*fuzpat) == vim_tolower(*str)) + { + char_u recursiveMatches[256]; + int recursiveScore = 0; + + // Supplied matches buffer was too short + if (nextMatch >= maxMatches) + return FALSE; + + // "Copy-on-Write" srcMatches into matches + if (first_match && srcMatches) + { + memcpy(matches, srcMatches, nextMatch); + first_match = FALSE; + } + + // Recursive call that "skips" this match + if (fuzzy_match_recursive(fuzpat, str + 1, &recursiveScore, + strBegin, matches, recursiveMatches, + sizeof(recursiveMatches), nextMatch, recursionCount, + recursionLimit)) + { + // Pick best recursive score + if (!recursiveMatch || recursiveScore > bestRecursiveScore) + { + memcpy(bestRecursiveMatches, recursiveMatches, 256); + bestRecursiveScore = recursiveScore; + } + recursiveMatch = TRUE; + } + + // Advance + matches[nextMatch++] = (char_u)(str - strBegin); + ++fuzpat; + } + ++str; + } + + // Determine if full fuzpat was matched + matched = *fuzpat == '\0' ? TRUE : FALSE; + + // Calculate score + if (matched) + { + // bonus for adjacent matches + int sequential_bonus = 15; + // bonus if match occurs after a separator + int separator_bonus = 30; + // bonus if match is uppercase and prev is lower + int camel_bonus = 30; + // bonus if the first letter is matched + int first_letter_bonus = 15; + // penalty applied for every letter in str before the first match + int leading_letter_penalty = -5; + // maximum penalty for leading letters + int max_leading_letter_penalty = -15; + // penalty for every letter that doesn't matter + int unmatched_letter_penalty = -1; + int penalty; + int unmatched; + int i; + + // Iterate str to end + while (*str != '\0') + ++str; + + // Initialize score + *outScore = 100; + + // Apply leading letter penalty + penalty = leading_letter_penalty * matches[0]; + if (penalty < max_leading_letter_penalty) + penalty = max_leading_letter_penalty; + *outScore += penalty; + + // Apply unmatched penalty + unmatched = (int)(str - strBegin) - nextMatch; + *outScore += unmatched_letter_penalty * unmatched; + + // Apply ordering bonuses + for (i = 0; i < nextMatch; ++i) + { + char_u currIdx = matches[i]; + + if (i > 0) + { + char_u prevIdx = matches[i - 1]; + + // Sequential + if (currIdx == (prevIdx + 1)) + *outScore += sequential_bonus; + } + + // Check for bonuses based on neighbor character value + if (currIdx > 0) + { + // Camel case + char_u neighbor = strBegin[currIdx - 1]; + char_u curr = strBegin[currIdx]; + int neighborSeparator; + + if (islower(neighbor) && isupper(curr)) + *outScore += camel_bonus; + + // Separator + neighborSeparator = neighbor == '_' || neighbor == ' '; + if (neighborSeparator) + *outScore += separator_bonus; + } + else + { + // First letter + *outScore += first_letter_bonus; + } + } + } + + // Return best result + if (recursiveMatch && (!matched || bestRecursiveScore > *outScore)) + { + // Recursive score is better than "this" + memcpy(matches, bestRecursiveMatches, maxMatches); + *outScore = bestRecursiveScore; + return TRUE; + } + else if (matched) + return TRUE; // "this" score is better than recursive + + return FALSE; // no match + } + + /* + * fuzzy_match() + * + * Performs exhaustive search via recursion to find all possible matches and + * match with highest score. + * Scores values have no intrinsic meaning. Possible score range is not + * normalized and varies with pattern. + * Recursion is limited internally (default=10) to prevent degenerate cases + * (fuzpat="aaaaaa" str="aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"). + * Uses char_u for match indices. Therefore patterns are limited to 256 + * characters. + * + * Returns TRUE if fuzpat is found AND calculates a score. + */ + static int + fuzzy_match(char_u *str, char_u *fuzpat, int *outScore) + { + char_u matches[256]; + int recursionCount = 0; + int recursionLimit = 10; + + *outScore = 0; + + return fuzzy_match_recursive(fuzpat, str, outScore, str, NULL, matches, + sizeof(matches), 0, &recursionCount, recursionLimit); + } + + /* + * Sort the fuzzy matches in the descending order of the match score. + */ + static int + fuzzy_item_compare(const void *s1, const void *s2) + { + int v1 = ((fuzzyItem_T *)s1)->score; + int v2 = ((fuzzyItem_T *)s2)->score; + + return v1 == v2 ? 0 : v1 > v2 ? -1 : 1; + } + + /* + * Fuzzy search the string 'str' in 'strlist' and return the matching strings + * in 'fmatchlist'. + */ + static void + match_fuzzy(list_T *strlist, char_u *str, list_T *fmatchlist) + { + long len; + fuzzyItem_T *ptrs; + listitem_T *li; + long i = 0; + int found_match = FALSE; + + len = list_len(strlist); + if (len == 0) + return; + + ptrs = ALLOC_MULT(fuzzyItem_T, len); + if (ptrs == NULL) + return; + + // For all the string items in strlist, get the fuzzy matching score + FOR_ALL_LIST_ITEMS(strlist, li) + { + int score; + + ptrs[i].item = li; + ptrs[i].score = -9999; + // ignore non-string items in the list + if (li->li_tv.v_type == VAR_STRING && li->li_tv.vval.v_string != NULL) + if (fuzzy_match(li->li_tv.vval.v_string, str, &score)) + { + ptrs[i].score = score; + found_match = TRUE; + } + ++i; + } + + if (found_match) + { + // Sort the list by the descending order of the match score + qsort((void *)ptrs, (size_t)len, sizeof(fuzzyItem_T), + fuzzy_item_compare); + + // Copy the matching strings with 'score != -9999' to the return list + for (i = 0; i < len; i++) + { + if (ptrs[i].score == -9999) + break; + list_append_string(fmatchlist, ptrs[i].item->li_tv.vval.v_string, + -1); + } + } + + vim_free(ptrs); + } + + /* + * "matchfuzzy()" function + */ + void + f_matchfuzzy(typval_T *argvars, typval_T *rettv) + { + if (argvars[0].v_type != VAR_LIST) + { + emsg(_(e_listreq)); + return; + } + if (argvars[0].vval.v_list == NULL) + return; + if (argvars[1].v_type != VAR_STRING + || argvars[1].vval.v_string == NULL) + { + semsg(_(e_invarg2), tv_get_string(&argvars[1])); + return; + } + if (rettv_list_alloc(rettv) == OK) + match_fuzzy(argvars[0].vval.v_list, tv_get_string(&argvars[1]), + rettv->vval.v_list); + } + #endif *** ../vim-8.2.1664/src/testdir/test_functions.vim 2020-09-04 21:18:40.484161926 +0200 --- src/testdir/test_functions.vim 2020-09-11 22:15:07.264666300 +0200 *************** *** 2554,2557 **** --- 2554,2581 ---- call assert_fails('call browsedir("open", [])', 'E730:') endfunc + " Test for matchfuzzy() + func Test_matchfuzzy() + call assert_fails('call matchfuzzy(10, "abc")', 'E714:') + call assert_fails('call matchfuzzy(["abc"], [])', 'E730:') + call assert_equal([], matchfuzzy([], 'abc')) + call assert_equal([], matchfuzzy(['abc'], '')) + call assert_equal(['abc'], matchfuzzy(['abc', 10], 'ac')) + call assert_equal([], matchfuzzy([10, 20], 'ac')) + call assert_equal(['abc'], matchfuzzy(['abc'], 'abc')) + call assert_equal(['crayon', 'camera'], matchfuzzy(['camera', 'crayon'], 'cra')) + call assert_equal(['aabbaa', 'aaabbbaaa', 'aaaabbbbaaaa', 'aba'], matchfuzzy(['aba', 'aabbaa', 'aaabbbaaa', 'aaaabbbbaaaa'], 'aa')) + call assert_equal(['one'], matchfuzzy(['one', 'two'], 'one')) + call assert_equal(['oneTwo', 'onetwo'], matchfuzzy(['onetwo', 'oneTwo'], 'oneTwo')) + call assert_equal(['one_two', 'onetwo'], matchfuzzy(['onetwo', 'one_two'], 'oneTwo')) + call assert_equal(['aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'], matchfuzzy(['aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'], 'aa')) + call assert_equal([], matchfuzzy([repeat('a', 300)], repeat('a', 257))) + + %bw! + eval ['somebuf', 'anotherone', 'needle', 'yetanotherone']->map({_, v -> bufadd(v) + bufload(v)}) + let l = getbufinfo()->map({_, v -> v.name})->matchfuzzy('ndl') + call assert_equal(1, len(l)) + call assert_match('needle', l[0]) + endfunc + " vim: shiftwidth=2 sts=2 expandtab *** ../vim-8.2.1664/src/version.c 2020-09-11 22:10:17.965597366 +0200 --- src/version.c 2020-09-11 22:23:52.163020448 +0200 *************** *** 752,753 **** --- 752,755 ---- { /* Add new patch number below this line */ + /**/ + 1665, /**/ -- SIGFUN -- signature too funny (core dumped) /// Bram Moolenaar -- Bram@Moolenaar.net -- http://www.Moolenaar.net \\\ /// sponsor Vim, vote for features -- http://www.Vim.org/sponsor/ \\\ \\\ an exciting new programming language -- http://www.Zimbu.org /// \\\ help me help AIDS victims -- http://ICCF-Holland.org ///