PageRenderTime 2ms CodeModel.GetById 80ms app.highlight 132ms RepoModel.GetById 1ms app.codeStats 1ms

/src/main/java/com/searchcode/app/util/SearchCodeLib.java

https://github.com/boyter/searchcode-server
Java | 595 lines | 392 code | 104 blank | 99 comment | 105 complexity | ffd5b0faf83ec702f2a77910de03f171 MD5 | raw file
  1/*
  2 * Copyright (c) 2016 Boyter Online Services
  3 *
  4 * Use of this software is governed by the Fair Source License included
  5 * in the LICENSE.TXT file, but will be eventually open under GNU General Public License Version 3
  6 * see the README.md for when this clause will take effect
  7 *
  8 * Version 1.3.15
  9 */
 10
 11package com.searchcode.app.util;
 12
 13import com.google.common.collect.Iterables;
 14import com.searchcode.app.config.Values;
 15import com.searchcode.app.dao.Data;
 16import com.searchcode.app.dto.*;
 17import com.searchcode.app.service.Singleton;
 18import org.apache.commons.lang3.StringUtils;
 19import org.apache.lucene.queryparser.classic.QueryParser;
 20
 21import java.util.*;
 22import java.util.regex.Matcher;
 23import java.util.regex.Pattern;
 24
 25public class SearchCodeLib {
 26
 27    private final ISpellingCorrector spellingCorrector;
 28    private final FileClassifier fileClassifier;
 29    private final int MINIFIED_LENGTH;
 30
 31    private final int MAX_SPLIT_LENGTH = 100_000;
 32    private final Pattern MULTIPLE_UPPERCASE = Pattern.compile("[A-Z]{2,}");
 33    private final boolean GUESS_BINARY = Boolean.parseBoolean(Properties.getProperties().getProperty(Values.GUESS_BINARY, Values.DEFAULT_GUESS_BINARY));
 34    private final boolean AND_MATCH = Boolean.parseBoolean(com.searchcode.app.util.Properties.getProperties().getProperty(Values.AND_MATCH, Values.DEFAULT_AND_MATCH));
 35
 36    public String[] WHITE_LIST = Properties.getProperties().getProperty(Values.BINARY_WHITE_LIST, Values.DEFAULT_BINARY_WHITE_LIST).split(",");
 37    public String[] BLACK_LIST = Properties.getProperties().getProperty(Values.BINARY_BLACK_LIST, Values.DEFAULT_BINARY_BLACK_LIST).split(",");
 38
 39    public SearchCodeLib() {
 40        this(Singleton.getSpellingCorrector(), new FileClassifier(), Singleton.getData(), Singleton.getHelpers());
 41    }
 42
 43    public SearchCodeLib(ISpellingCorrector spellingCorrector, FileClassifier fileClassifier, Data data, Helpers helpers) {
 44        this.spellingCorrector = spellingCorrector;
 45        this.fileClassifier = fileClassifier;
 46
 47        int minifiedLength = helpers.tryParseInt(data.getDataByName(Values.MINIFIEDLENGTH, Values.DEFAULTMINIFIEDLENGTH), Values.DEFAULTMINIFIEDLENGTH);
 48        this.MINIFIED_LENGTH = minifiedLength <= 0 ? Integer.parseInt(Values.DEFAULTMINIFIEDLENGTH) : minifiedLength;
 49    }
 50
 51    /**
 52     * Split "intelligently" on anything over 7 characters long
 53     * if it only contains [a-zA-Z]
 54     * split based on uppercase String[] r = s.split("(?=\\p{Upper})");
 55     * add those as additional words to index on
 56     * so that things like RegexIndexer becomes Regex Indexer
 57     * split the string by spaces
 58     * look for anything over 7 characters long
 59     * if its only [a-zA-Z]
 60     * split by uppercase
 61     */
 62    public String splitKeywords(String contents, boolean runningJoin) {
 63        if (contents == null) {
 64            return Values.EMPTYSTRING;
 65        }
 66
 67        StringBuilder indexContents = new StringBuilder();
 68
 69        contents = contents.replaceAll("[^a-zA-Z0-9]", " ");
 70
 71        // Performance improvement hack
 72        if (contents.length() > this.MAX_SPLIT_LENGTH) {
 73
 74            // Add AAA to ensure we dont split the last word if it was cut off
 75            contents = contents.substring(0, MAX_SPLIT_LENGTH) + "AAA";
 76        }
 77
 78        for (String splitContents : contents.split(" ")) {
 79            if (splitContents.length() >= 7) {
 80                Matcher m = MULTIPLE_UPPERCASE.matcher(splitContents);
 81
 82                if (!m.find()) {
 83                    String[] splitStrings = splitContents.split("(?=\\p{Upper})");
 84
 85                    if (splitStrings.length > 1) {
 86                        indexContents.append(" ").append(StringUtils.join(splitStrings, " "));
 87
 88                        if (runningJoin) {
 89                            StringBuilder running = new StringBuilder();
 90                            for (String split : splitStrings) {
 91                                running.append(split);
 92                                indexContents.append(" ").append(running.toString());
 93                            }
 94                        }
 95                    }
 96                }
 97            }
 98        }
 99
100        return indexContents.toString();
101    }
102
103    public String findInterestingKeywords(String contents) {
104        if (contents == null) {
105            return Values.EMPTYSTRING;
106        }
107
108        StringBuilder indexContents = new StringBuilder();
109
110        // Performance improvement hack
111        if (contents.length() > this.MAX_SPLIT_LENGTH) {
112            // Add AAA to ensure we dont split the last word if it was cut off
113            contents = contents.substring(0, MAX_SPLIT_LENGTH) + "AAA";
114        }
115
116        // Finds versions with words at the front, eg linux2.7.4
117        Matcher m = Pattern.compile("[a-z]+(\\d+\\.)?(\\d+\\.)?(\\*|\\d+)").matcher(contents);
118
119        while (m.find()) {
120            indexContents.append(" ");
121            indexContents.append(m.group());
122        }
123
124        return indexContents.toString();
125    }
126
127    public String findInterestingCharacters(String contents) {
128        if (contents == null) {
129            return Values.EMPTYSTRING;
130        }
131
132        String replaced = contents.replaceAll("\\w", "");
133
134        StringBuilder stringBuilder = new StringBuilder();
135        for (char c : replaced.toCharArray()) {
136            stringBuilder.append(c).append(" ");
137        }
138
139        return stringBuilder.toString();
140    }
141
142    /**
143     * List of languages to ignore displaying the cost for
144     * TODO move this into the database so it is configurable
145     */
146    public boolean languageCostIgnore(String languagename) {
147
148        boolean ignore;
149
150        switch (languagename) {
151            case "Unknown":
152            case "Text":
153            case "JSON":
154            case "Markdown":
155            case "INI File":
156            case "ReStructuredText":
157            case "Configuration":
158                ignore = true;
159                break;
160            default:
161                ignore = false;
162                break;
163        }
164
165        return ignore;
166    }
167
168    /**
169     * Adds a string into the spelling corrector.
170     * TODO move this into the spelling corrector class itself
171     */
172    public void addToSpellingCorrector(String contents) {
173        if (contents == null) {
174            return;
175        }
176
177        // Limit to reduce performance impacts
178        if (contents.length() > this.MAX_SPLIT_LENGTH) {
179            contents = contents.substring(0, MAX_SPLIT_LENGTH);
180        }
181
182        List<String> splitString = Arrays.asList(contents.replaceAll("[^a-zA-Z0-9]", " ").toLowerCase().split(" "));
183
184        // Only the first 10000 to avoid causing too much slow-down
185        if (splitString.size() > 10_000) {
186            splitString = splitString.subList(0, 10_000);
187        }
188
189        for (String s : splitString) {
190            if (s.length() >= 3) {
191                this.spellingCorrector.putWord(s);
192            }
193        }
194    }
195
196    /**
197     * Determine if a List<String> which is used to represent a code file contains a code file that is
198     * suspected to be minified. This is for the purposes of excluding it from the index.
199     */
200    public boolean isMinified(List<String> codeLines, String fileName) {
201
202        var lowerFileName = fileName.toLowerCase();
203
204        for (var extension : this.WHITE_LIST) {
205            if (lowerFileName.endsWith("." + extension)) {
206                return false;
207            }
208        }
209
210        var average = codeLines.stream().map(x -> x.trim().replace(" ", "")).mapToInt(String::length).average();
211        if (average.isPresent() && average.getAsDouble() > this.MINIFIED_LENGTH) {
212            return true;
213        }
214
215        return false;
216    }
217
218    /**
219     * Determine if a List<String> which is used to represent a code file contains a code file that is
220     * suspected to be ascii or non ascii. This is for the purposes of excluding it from the index.
221     */
222    public BinaryFinding isBinary(List<String> codeLines, String fileName) {
223        if (codeLines.isEmpty()) {
224            return new BinaryFinding(true, "file is empty");
225        }
226
227        var lowerFileName = fileName.toLowerCase();
228        // Check against user set whitelist
229        for (var extension : this.WHITE_LIST) {
230            if (lowerFileName.endsWith("." + extension)) {
231                return new BinaryFinding(false, "appears in extension whitelist");
232            }
233        }
234
235        // Check against user set blacklist
236        for (var extension : this.BLACK_LIST) {
237            if (lowerFileName.endsWith("." + extension) || lowerFileName.equals(extension)) {
238                return new BinaryFinding(true, "appears in extension blacklist");
239            }
240        }
241
242        // Check if whitelisted extension IE what we know about
243        var database = fileClassifier.getDatabase();
244        for (var key : database.keySet()) {
245            var fileClassifierResult = database.get(key);
246            for (var extension : fileClassifierResult.extensions) {
247                if (lowerFileName.endsWith("." + extension)) {
248                    return new BinaryFinding(false, "appears in internal extension whitelist");
249                }
250            }
251        }
252
253        // If we aren't meant to guess then assume it isn't binary
254        if (!this.GUESS_BINARY) {
255            return new BinaryFinding(false, Values.EMPTYSTRING);
256        }
257
258        // GNU Grep, ripgrep and git all take the approach that if a file as a nul
259        // byte in it then it is binary. If its good enough for those giants
260        // its good enough for us.
261        for (int i = 0; i < codeLines.size(); i++) {
262            var line = codeLines.get(i);
263            for (int j = 0; j < line.length(); j++) {
264                if (line.charAt(j) == 0) {
265                    return new BinaryFinding(true, "nul byte found");
266                }
267            }
268        }
269
270        return new BinaryFinding(false, Values.EMPTYSTRING);
271    }
272
273    /**
274     * Determines who owns a piece of code weighted by time based on current second (IE time now)
275     * NB if a commit is very close to this time it will always win
276     */
277    public String codeOwner(List<CodeOwner> codeOwners) {
278        long currentUnix = System.currentTimeMillis() / 1_000L;
279
280        double best = 0;
281        String owner = "Unknown";
282
283        for (CodeOwner codeOwner : codeOwners) {
284            double age = (currentUnix - codeOwner.getMostRecentUnixCommitTimestamp()) / 60 / 60;
285            double calc = codeOwner.getNoLines() / Math.pow((age), 1.8);
286
287            if (calc > best) {
288                best = calc;
289                owner = codeOwner.getName();
290            }
291        }
292
293        return owner;
294    }
295
296    /**
297     * Cleans and formats the code into something that can be indexed by lucene while supporting searches such as
298     * i++ matching for(int i=0;i<100;i++;){
299     */
300    public String codeCleanPipeline(String originalContents) {
301        if (originalContents == null) {
302            return Values.EMPTYSTRING;
303        }
304
305        String modifiedContents = originalContents;
306
307        StringBuilder indexContents = new StringBuilder();
308
309        // Change how we replace strings
310        // Modify the contents to match strings correctly
311        char[] firstReplacements = {'<', '>', ')', '(', '[', ']', '|', '=', ',', ':'};
312        for (char c : firstReplacements) {
313            modifiedContents = modifiedContents.replace(c, ' ');
314        }
315        indexContents.append(" ").append(modifiedContents);
316
317        char[] otherReplacements = {'.'};
318        for (char c : otherReplacements) {
319            modifiedContents = modifiedContents.replace(c, ' ');
320        }
321        indexContents.append(" ").append(modifiedContents);
322
323        char[] secondReplacements = {';', '{', '}', '/'};
324        for (char c : secondReplacements) {
325            modifiedContents = modifiedContents.replace(c, ' ');
326        }
327        indexContents.append(" ").append(modifiedContents);
328
329        char[] forthReplacements = {'"', '\''};
330        for (char c : forthReplacements) {
331            modifiedContents = modifiedContents.replace(c, ' ');
332        }
333        indexContents.append(" ").append(modifiedContents);
334
335        // Now do it for other characters
336        char[] replacements = {'\'', '"', '.', ';', '=', '(', ')', '[', ']', '_', ';', '@', '#'};
337        for (char c : replacements) {
338            modifiedContents = modifiedContents.replace(c, ' ');
339        }
340        indexContents.append(" ").append(modifiedContents);
341
342        char[] thirdReplacements = {'-'};
343        for (char c : thirdReplacements) {
344            modifiedContents = modifiedContents.replace(c, ' ');
345        }
346        indexContents.append(" ").append(modifiedContents);
347
348        // Issue 188 Fixes
349        modifiedContents = originalContents;
350        char[] replacements188 = {'(', ')', '<', '>'};
351        for (char c : replacements188) {
352            modifiedContents = modifiedContents.replace(c, ' ');
353        }
354        indexContents.append(" ").append(modifiedContents);
355
356
357        return indexContents.toString();
358    }
359
360    /**
361     * Parse the query and escape it as per Lucene but without affecting search operators such as AND OR and NOT
362     */
363    public String formatQueryString(String query) {
364        if (this.AND_MATCH) {
365            return this.formatQueryStringAndDefault(query);
366        }
367
368        return this.formatQueryStringOrDefault(query);
369    }
370
371    public String formatQueryStringAndDefault(String query) {
372        String[] split = query.trim().split("\\s+");
373
374        List<String> stringList = new ArrayList<>();
375
376        String and = " AND ";
377        String or = " OR ";
378        String not = " NOT ";
379
380        for (String term : split) {
381            switch (term) {
382                case "AND":
383                    if (Iterables.getLast(stringList, null) != null && !Iterables.getLast(stringList).equals(and)) {
384                        stringList.add(and);
385                    }
386                    break;
387                case "OR":
388                    if (Iterables.getLast(stringList, null) != null && !Iterables.getLast(stringList).equals(or)) {
389                        stringList.add(or);
390                    }
391                    break;
392                case "NOT":
393                    if (Iterables.getLast(stringList, null) != null && !Iterables.getLast(stringList).equals(not)) {
394                        stringList.add(not);
395                    }
396                    break;
397                default:
398                    if (Iterables.getLast(stringList, null) == null ||
399                            Iterables.getLast(stringList).equals(and) ||
400                            Iterables.getLast(stringList).equals(or) ||
401                            Iterables.getLast(stringList).equals(not)) {
402                        stringList.add(" " + QueryParser.escape(term.toLowerCase()).replace("\\(", "(").replace("\\)", ")").replace("\\*", "*") + " ");
403                    } else {
404                        stringList.add(and + QueryParser.escape(term.toLowerCase()).replace("\\(", "(").replace("\\)", ")").replace("\\*", "*") + " ");
405                    }
406                    break;
407            }
408        }
409        String temp = StringUtils.join(stringList, " ");
410        return temp.trim();
411    }
412
413    public String formatQueryStringOrDefault(String query) {
414        String[] split = query.trim().split("\\s+");
415
416        StringBuilder sb = new StringBuilder();
417
418        String and = " AND ";
419        String or = " OR ";
420        String not = " NOT ";
421
422        for (String term : split) {
423            switch (term) {
424                case "AND":
425                    sb.append(and);
426                    break;
427                case "OR":
428                    sb.append(or);
429                    break;
430                case "NOT":
431                    sb.append(not);
432                    break;
433                default:
434                    sb.append(" ");
435                    sb.append(QueryParser.escape(term.toLowerCase()).replace("\\(", "(").replace("\\)", ")").replace("\\*", "*"));
436                    sb.append(" ");
437                    break;
438            }
439        }
440
441        return sb.toString().trim();
442    }
443
444    /**
445     * Given a query attempts to create alternative queries that should be looser and as such produce more matches
446     * or give results where none may exist for the current query.
447     */
448    public List<String> generateAltQueries(String query) {
449        List<String> altQueries = new ArrayList<>();
450        query = query.trim().replaceAll(" +", " ");
451        String altquery = query.replaceAll("[^A-Za-z0-9 ]", " ").trim().replaceAll(" +", " ");
452
453        if (!altquery.equals(query) && !Values.EMPTYSTRING.equals(altquery)) {
454            altQueries.add(altquery);
455        }
456
457        altquery = this.splitKeywords(query, false).trim();
458        if (!altquery.equals("") && !altquery.equals(query) && !altQueries.contains(altquery)) {
459            altQueries.add(altquery);
460        }
461
462        StringBuilder stringBuilder = new StringBuilder();
463        for (String word : query.replaceAll(" +", " ").split(" ")) {
464            if (!word.trim().equals("AND") && !word.trim().equals("OR") && !word.trim().equals("NOT")) {
465                stringBuilder.append(" ").append(this.spellingCorrector.correct(word));
466            }
467        }
468        altquery = stringBuilder.toString().trim();
469
470        if (!altquery.toLowerCase().equals(query.toLowerCase()) && !altQueries.contains(altquery)) {
471            altQueries.add(altquery);
472        }
473
474        altquery = query.replace(" AND ", " OR ");
475        if (!altquery.toLowerCase().equals(query.toLowerCase()) && !altQueries.contains(altquery)) {
476            altQueries.add(altquery);
477        }
478
479        altquery = query.replace(" AND ", " ");
480        if (!altquery.toLowerCase().equals(query.toLowerCase()) && !altQueries.contains(altquery)) {
481            altQueries.add(altquery);
482        }
483
484        altquery = query.replace(" NOT ", " ");
485        if (!altquery.toLowerCase().equals(query.toLowerCase()) && !altQueries.contains(altquery)) {
486            altQueries.add(altquery);
487        }
488
489        return altQueries;
490    }
491
492
493    public String generateBusBlurb(ProjectStats projectStats) {
494
495        StringBuilder stringBuilder = new StringBuilder();
496        stringBuilder.append("In this repository ").append(projectStats.getRepoFacetOwner().size());
497
498        if (projectStats.getRepoFacetOwner().size() == 1) {
499            stringBuilder.append(" committer has contributed to ");
500        } else {
501            stringBuilder.append(" committers have contributed to ");
502        }
503
504        if (projectStats.getTotalFiles() == 1) {
505            stringBuilder.append(projectStats.getTotalFiles()).append(" file. ");
506        } else {
507            stringBuilder.append(projectStats.getTotalFiles()).append(" files. ");
508        }
509
510        List<CodeFacetLanguage> codeFacetLanguages = projectStats.getCodeFacetLanguages();
511
512        if (codeFacetLanguages.size() == 1) {
513            stringBuilder.append("The most important language in this repository is ").append(codeFacetLanguages.get(0).getLanguageName()).append(". ");
514        } else {
515            stringBuilder.append("The most important languages in this repository are ");
516
517            if (!codeFacetLanguages.isEmpty()) {
518                if (codeFacetLanguages.size() > 3) {
519                    codeFacetLanguages = codeFacetLanguages.subList(0, 3);
520                }
521                for (int i = 0; i < codeFacetLanguages.size() - 1; i++) {
522                    stringBuilder.append(codeFacetLanguages.get(i).getLanguageName()).append(", ");
523                }
524                stringBuilder.append(" and ").append(codeFacetLanguages.get(codeFacetLanguages.size() - 1).getLanguageName()).append(". ");
525            }
526        }
527
528        if (!projectStats.getRepoFacetOwner().isEmpty()) {
529            if (projectStats.getRepoFacetOwner().size() < 5) {
530                stringBuilder.append("The project has a low bus factor of ").append(projectStats.getRepoFacetOwner().size());
531                stringBuilder.append(" and will be in trouble if ").append(projectStats.getRepoFacetOwner().get(0).getOwner()).append(" is hit by a bus. ");
532            } else if (projectStats.getRepoFacetOwner().size() < 15) {
533                stringBuilder.append("The project has bus factor of ").append(projectStats.getRepoFacetOwner().size()).append(". ");
534            } else {
535                stringBuilder.append("The project has high bus factor of ").append(projectStats.getRepoFacetOwner().size()).append(". ");
536            }
537        }
538
539        List<String> highKnowledge = new ArrayList<>();
540        double sumAverageFilesWorked = 0;
541        for (CodeFacetOwner codeFacetOwner : projectStats.getRepoFacetOwner()) {
542            double currentAverage = (double) codeFacetOwner.getCount() / (double) projectStats.getTotalFiles();
543            sumAverageFilesWorked += currentAverage;
544
545            if (currentAverage > 0.1) {
546                highKnowledge.add(codeFacetOwner.getOwner());
547            }
548        }
549
550        int averageFilesWorked = (int) (sumAverageFilesWorked / projectStats.getRepoFacetOwner().size() * 100);
551
552        stringBuilder.append("The average person who commits this project has ownership of ");
553        stringBuilder.append(averageFilesWorked).append("% of files. ");
554
555        if (!highKnowledge.isEmpty()) {
556            stringBuilder.append("The project relies on the following people; ");
557            stringBuilder.append(StringUtils.join(highKnowledge, ", ")).append(". ");
558        }
559
560        return stringBuilder.toString().replace(",  and", " and");
561    }
562
563//    /**
564//     * Currently not used but meant to replicate the searchcode.com hash which is used to identify duplicate files
565//     * even when they have a few characters or lines missing. It should in these cases produce identical hashes.
566//     */
567//    public String hash(String contents) {
568//        int hashLength = 20;
569//
570//        if (contents.length() == 0) {
571//            return Strings.padStart("", hashLength, '0');
572//        }
573//
574//        String allowedCharacters = "BCDFGHIJKLMNOPQRSUVWXYZbcdfghijklmnopqrsuvwxyz1234567890";
575//
576//        // remove all spaces
577//        Joiner joiner = Joiner.on("").skipNulls();
578//        String toHash = joiner.join(Splitter.on(' ')
579//                            .trimResults()
580//                            .omitEmptyStrings()
581//                            .split(contents));
582//
583//        // remove all non acceptable characters
584//        for(int i=0; i< toHash.length(); i++) {
585//            char c = toHash.charAt(i);
586//
587//            if (allowedCharacters.indexOf(c) != -1) {
588//                // allowed so keep it
589//            }
590//        }
591//
592//        return "";
593//    }
594}
595