PageRenderTime 52ms CodeModel.GetById 23ms RepoModel.GetById 1ms app.codeStats 0ms

/src/main/java/com/searchcode/app/util/SearchCodeLib.java

https://github.com/boyter/searchcode-server
Java | 595 lines | 392 code | 104 blank | 99 comment | 105 complexity | ffd5b0faf83ec702f2a77910de03f171 MD5 | raw file
  1. /*
  2. * Copyright (c) 2016 Boyter Online Services
  3. *
  4. * Use of this software is governed by the Fair Source License included
  5. * in the LICENSE.TXT file, but will be eventually open under GNU General Public License Version 3
  6. * see the README.md for when this clause will take effect
  7. *
  8. * Version 1.3.15
  9. */
  10. package com.searchcode.app.util;
  11. import com.google.common.collect.Iterables;
  12. import com.searchcode.app.config.Values;
  13. import com.searchcode.app.dao.Data;
  14. import com.searchcode.app.dto.*;
  15. import com.searchcode.app.service.Singleton;
  16. import org.apache.commons.lang3.StringUtils;
  17. import org.apache.lucene.queryparser.classic.QueryParser;
  18. import java.util.*;
  19. import java.util.regex.Matcher;
  20. import java.util.regex.Pattern;
  21. public class SearchCodeLib {
  22. private final ISpellingCorrector spellingCorrector;
  23. private final FileClassifier fileClassifier;
  24. private final int MINIFIED_LENGTH;
  25. private final int MAX_SPLIT_LENGTH = 100_000;
  26. private final Pattern MULTIPLE_UPPERCASE = Pattern.compile("[A-Z]{2,}");
  27. private final boolean GUESS_BINARY = Boolean.parseBoolean(Properties.getProperties().getProperty(Values.GUESS_BINARY, Values.DEFAULT_GUESS_BINARY));
  28. private final boolean AND_MATCH = Boolean.parseBoolean(com.searchcode.app.util.Properties.getProperties().getProperty(Values.AND_MATCH, Values.DEFAULT_AND_MATCH));
  29. public String[] WHITE_LIST = Properties.getProperties().getProperty(Values.BINARY_WHITE_LIST, Values.DEFAULT_BINARY_WHITE_LIST).split(",");
  30. public String[] BLACK_LIST = Properties.getProperties().getProperty(Values.BINARY_BLACK_LIST, Values.DEFAULT_BINARY_BLACK_LIST).split(",");
  /**
   * Creates an instance wired with the collaborators handed out by {@link Singleton}
   * and a freshly constructed {@link FileClassifier}.
   */
  public SearchCodeLib() {
    this(
        Singleton.getSpellingCorrector(),
        new FileClassifier(),
        Singleton.getData(),
        Singleton.getHelpers());
  }
  34. public SearchCodeLib(ISpellingCorrector spellingCorrector, FileClassifier fileClassifier, Data data, Helpers helpers) {
  35. this.spellingCorrector = spellingCorrector;
  36. this.fileClassifier = fileClassifier;
  37. int minifiedLength = helpers.tryParseInt(data.getDataByName(Values.MINIFIEDLENGTH, Values.DEFAULTMINIFIEDLENGTH), Values.DEFAULTMINIFIEDLENGTH);
  38. this.MINIFIED_LENGTH = minifiedLength <= 0 ? Integer.parseInt(Values.DEFAULTMINIFIEDLENGTH) : minifiedLength;
  39. }
  40. /**
  41. * Split "intelligently" on anything over 7 characters long
  42. * if it only contains [a-zA-Z]
  43. * split based on uppercase String[] r = s.split("(?=\\p{Upper})");
  44. * add those as additional words to index on
  45. * so that things like RegexIndexer becomes Regex Indexer
  46. * split the string by spaces
  47. * look for anything over 7 characters long
  48. * if its only [a-zA-Z]
  49. * split by uppercase
  50. */
  51. public String splitKeywords(String contents, boolean runningJoin) {
  52. if (contents == null) {
  53. return Values.EMPTYSTRING;
  54. }
  55. StringBuilder indexContents = new StringBuilder();
  56. contents = contents.replaceAll("[^a-zA-Z0-9]", " ");
  57. // Performance improvement hack
  58. if (contents.length() > this.MAX_SPLIT_LENGTH) {
  59. // Add AAA to ensure we dont split the last word if it was cut off
  60. contents = contents.substring(0, MAX_SPLIT_LENGTH) + "AAA";
  61. }
  62. for (String splitContents : contents.split(" ")) {
  63. if (splitContents.length() >= 7) {
  64. Matcher m = MULTIPLE_UPPERCASE.matcher(splitContents);
  65. if (!m.find()) {
  66. String[] splitStrings = splitContents.split("(?=\\p{Upper})");
  67. if (splitStrings.length > 1) {
  68. indexContents.append(" ").append(StringUtils.join(splitStrings, " "));
  69. if (runningJoin) {
  70. StringBuilder running = new StringBuilder();
  71. for (String split : splitStrings) {
  72. running.append(split);
  73. indexContents.append(" ").append(running.toString());
  74. }
  75. }
  76. }
  77. }
  78. }
  79. }
  80. return indexContents.toString();
  81. }
  82. public String findInterestingKeywords(String contents) {
  83. if (contents == null) {
  84. return Values.EMPTYSTRING;
  85. }
  86. StringBuilder indexContents = new StringBuilder();
  87. // Performance improvement hack
  88. if (contents.length() > this.MAX_SPLIT_LENGTH) {
  89. // Add AAA to ensure we dont split the last word if it was cut off
  90. contents = contents.substring(0, MAX_SPLIT_LENGTH) + "AAA";
  91. }
  92. // Finds versions with words at the front, eg linux2.7.4
  93. Matcher m = Pattern.compile("[a-z]+(\\d+\\.)?(\\d+\\.)?(\\*|\\d+)").matcher(contents);
  94. while (m.find()) {
  95. indexContents.append(" ");
  96. indexContents.append(m.group());
  97. }
  98. return indexContents.toString();
  99. }
  100. public String findInterestingCharacters(String contents) {
  101. if (contents == null) {
  102. return Values.EMPTYSTRING;
  103. }
  104. String replaced = contents.replaceAll("\\w", "");
  105. StringBuilder stringBuilder = new StringBuilder();
  106. for (char c : replaced.toCharArray()) {
  107. stringBuilder.append(c).append(" ");
  108. }
  109. return stringBuilder.toString();
  110. }
  111. /**
  112. * List of languages to ignore displaying the cost for
  113. * TODO move this into the database so it is configurable
  114. */
  115. public boolean languageCostIgnore(String languagename) {
  116. boolean ignore;
  117. switch (languagename) {
  118. case "Unknown":
  119. case "Text":
  120. case "JSON":
  121. case "Markdown":
  122. case "INI File":
  123. case "ReStructuredText":
  124. case "Configuration":
  125. ignore = true;
  126. break;
  127. default:
  128. ignore = false;
  129. break;
  130. }
  131. return ignore;
  132. }
  133. /**
  134. * Adds a string into the spelling corrector.
  135. * TODO move this into the spelling corrector class itself
  136. */
  137. public void addToSpellingCorrector(String contents) {
  138. if (contents == null) {
  139. return;
  140. }
  141. // Limit to reduce performance impacts
  142. if (contents.length() > this.MAX_SPLIT_LENGTH) {
  143. contents = contents.substring(0, MAX_SPLIT_LENGTH);
  144. }
  145. List<String> splitString = Arrays.asList(contents.replaceAll("[^a-zA-Z0-9]", " ").toLowerCase().split(" "));
  146. // Only the first 10000 to avoid causing too much slow-down
  147. if (splitString.size() > 10_000) {
  148. splitString = splitString.subList(0, 10_000);
  149. }
  150. for (String s : splitString) {
  151. if (s.length() >= 3) {
  152. this.spellingCorrector.putWord(s);
  153. }
  154. }
  155. }
  156. /**
  157. * Determine if a List<String> which is used to represent a code file contains a code file that is
  158. * suspected to be minified. This is for the purposes of excluding it from the index.
  159. */
  160. public boolean isMinified(List<String> codeLines, String fileName) {
  161. var lowerFileName = fileName.toLowerCase();
  162. for (var extension : this.WHITE_LIST) {
  163. if (lowerFileName.endsWith("." + extension)) {
  164. return false;
  165. }
  166. }
  167. var average = codeLines.stream().map(x -> x.trim().replace(" ", "")).mapToInt(String::length).average();
  168. if (average.isPresent() && average.getAsDouble() > this.MINIFIED_LENGTH) {
  169. return true;
  170. }
  171. return false;
  172. }
  173. /**
  174. * Determine if a List<String> which is used to represent a code file contains a code file that is
  175. * suspected to be ascii or non ascii. This is for the purposes of excluding it from the index.
  176. */
  177. public BinaryFinding isBinary(List<String> codeLines, String fileName) {
  178. if (codeLines.isEmpty()) {
  179. return new BinaryFinding(true, "file is empty");
  180. }
  181. var lowerFileName = fileName.toLowerCase();
  182. // Check against user set whitelist
  183. for (var extension : this.WHITE_LIST) {
  184. if (lowerFileName.endsWith("." + extension)) {
  185. return new BinaryFinding(false, "appears in extension whitelist");
  186. }
  187. }
  188. // Check against user set blacklist
  189. for (var extension : this.BLACK_LIST) {
  190. if (lowerFileName.endsWith("." + extension) || lowerFileName.equals(extension)) {
  191. return new BinaryFinding(true, "appears in extension blacklist");
  192. }
  193. }
  194. // Check if whitelisted extension IE what we know about
  195. var database = fileClassifier.getDatabase();
  196. for (var key : database.keySet()) {
  197. var fileClassifierResult = database.get(key);
  198. for (var extension : fileClassifierResult.extensions) {
  199. if (lowerFileName.endsWith("." + extension)) {
  200. return new BinaryFinding(false, "appears in internal extension whitelist");
  201. }
  202. }
  203. }
  204. // If we aren't meant to guess then assume it isn't binary
  205. if (!this.GUESS_BINARY) {
  206. return new BinaryFinding(false, Values.EMPTYSTRING);
  207. }
  208. // GNU Grep, ripgrep and git all take the approach that if a file as a nul
  209. // byte in it then it is binary. If its good enough for those giants
  210. // its good enough for us.
  211. for (int i = 0; i < codeLines.size(); i++) {
  212. var line = codeLines.get(i);
  213. for (int j = 0; j < line.length(); j++) {
  214. if (line.charAt(j) == 0) {
  215. return new BinaryFinding(true, "nul byte found");
  216. }
  217. }
  218. }
  219. return new BinaryFinding(false, Values.EMPTYSTRING);
  220. }
  221. /**
  222. * Determines who owns a piece of code weighted by time based on current second (IE time now)
  223. * NB if a commit is very close to this time it will always win
  224. */
  225. public String codeOwner(List<CodeOwner> codeOwners) {
  226. long currentUnix = System.currentTimeMillis() / 1_000L;
  227. double best = 0;
  228. String owner = "Unknown";
  229. for (CodeOwner codeOwner : codeOwners) {
  230. double age = (currentUnix - codeOwner.getMostRecentUnixCommitTimestamp()) / 60 / 60;
  231. double calc = codeOwner.getNoLines() / Math.pow((age), 1.8);
  232. if (calc > best) {
  233. best = calc;
  234. owner = codeOwner.getName();
  235. }
  236. }
  237. return owner;
  238. }
  239. /**
  240. * Cleans and formats the code into something that can be indexed by lucene while supporting searches such as
  241. * i++ matching for(int i=0;i<100;i++;){
  242. */
  243. public String codeCleanPipeline(String originalContents) {
  244. if (originalContents == null) {
  245. return Values.EMPTYSTRING;
  246. }
  247. String modifiedContents = originalContents;
  248. StringBuilder indexContents = new StringBuilder();
  249. // Change how we replace strings
  250. // Modify the contents to match strings correctly
  251. char[] firstReplacements = {'<', '>', ')', '(', '[', ']', '|', '=', ',', ':'};
  252. for (char c : firstReplacements) {
  253. modifiedContents = modifiedContents.replace(c, ' ');
  254. }
  255. indexContents.append(" ").append(modifiedContents);
  256. char[] otherReplacements = {'.'};
  257. for (char c : otherReplacements) {
  258. modifiedContents = modifiedContents.replace(c, ' ');
  259. }
  260. indexContents.append(" ").append(modifiedContents);
  261. char[] secondReplacements = {';', '{', '}', '/'};
  262. for (char c : secondReplacements) {
  263. modifiedContents = modifiedContents.replace(c, ' ');
  264. }
  265. indexContents.append(" ").append(modifiedContents);
  266. char[] forthReplacements = {'"', '\''};
  267. for (char c : forthReplacements) {
  268. modifiedContents = modifiedContents.replace(c, ' ');
  269. }
  270. indexContents.append(" ").append(modifiedContents);
  271. // Now do it for other characters
  272. char[] replacements = {'\'', '"', '.', ';', '=', '(', ')', '[', ']', '_', ';', '@', '#'};
  273. for (char c : replacements) {
  274. modifiedContents = modifiedContents.replace(c, ' ');
  275. }
  276. indexContents.append(" ").append(modifiedContents);
  277. char[] thirdReplacements = {'-'};
  278. for (char c : thirdReplacements) {
  279. modifiedContents = modifiedContents.replace(c, ' ');
  280. }
  281. indexContents.append(" ").append(modifiedContents);
  282. // Issue 188 Fixes
  283. modifiedContents = originalContents;
  284. char[] replacements188 = {'(', ')', '<', '>'};
  285. for (char c : replacements188) {
  286. modifiedContents = modifiedContents.replace(c, ' ');
  287. }
  288. indexContents.append(" ").append(modifiedContents);
  289. return indexContents.toString();
  290. }
  291. /**
  292. * Parse the query and escape it as per Lucene but without affecting search operators such as AND OR and NOT
  293. */
  294. public String formatQueryString(String query) {
  295. if (this.AND_MATCH) {
  296. return this.formatQueryStringAndDefault(query);
  297. }
  298. return this.formatQueryStringOrDefault(query);
  299. }
  300. public String formatQueryStringAndDefault(String query) {
  301. String[] split = query.trim().split("\\s+");
  302. List<String> stringList = new ArrayList<>();
  303. String and = " AND ";
  304. String or = " OR ";
  305. String not = " NOT ";
  306. for (String term : split) {
  307. switch (term) {
  308. case "AND":
  309. if (Iterables.getLast(stringList, null) != null && !Iterables.getLast(stringList).equals(and)) {
  310. stringList.add(and);
  311. }
  312. break;
  313. case "OR":
  314. if (Iterables.getLast(stringList, null) != null && !Iterables.getLast(stringList).equals(or)) {
  315. stringList.add(or);
  316. }
  317. break;
  318. case "NOT":
  319. if (Iterables.getLast(stringList, null) != null && !Iterables.getLast(stringList).equals(not)) {
  320. stringList.add(not);
  321. }
  322. break;
  323. default:
  324. if (Iterables.getLast(stringList, null) == null ||
  325. Iterables.getLast(stringList).equals(and) ||
  326. Iterables.getLast(stringList).equals(or) ||
  327. Iterables.getLast(stringList).equals(not)) {
  328. stringList.add(" " + QueryParser.escape(term.toLowerCase()).replace("\\(", "(").replace("\\)", ")").replace("\\*", "*") + " ");
  329. } else {
  330. stringList.add(and + QueryParser.escape(term.toLowerCase()).replace("\\(", "(").replace("\\)", ")").replace("\\*", "*") + " ");
  331. }
  332. break;
  333. }
  334. }
  335. String temp = StringUtils.join(stringList, " ");
  336. return temp.trim();
  337. }
  338. public String formatQueryStringOrDefault(String query) {
  339. String[] split = query.trim().split("\\s+");
  340. StringBuilder sb = new StringBuilder();
  341. String and = " AND ";
  342. String or = " OR ";
  343. String not = " NOT ";
  344. for (String term : split) {
  345. switch (term) {
  346. case "AND":
  347. sb.append(and);
  348. break;
  349. case "OR":
  350. sb.append(or);
  351. break;
  352. case "NOT":
  353. sb.append(not);
  354. break;
  355. default:
  356. sb.append(" ");
  357. sb.append(QueryParser.escape(term.toLowerCase()).replace("\\(", "(").replace("\\)", ")").replace("\\*", "*"));
  358. sb.append(" ");
  359. break;
  360. }
  361. }
  362. return sb.toString().trim();
  363. }
  364. /**
  365. * Given a query attempts to create alternative queries that should be looser and as such produce more matches
  366. * or give results where none may exist for the current query.
  367. */
  368. public List<String> generateAltQueries(String query) {
  369. List<String> altQueries = new ArrayList<>();
  370. query = query.trim().replaceAll(" +", " ");
  371. String altquery = query.replaceAll("[^A-Za-z0-9 ]", " ").trim().replaceAll(" +", " ");
  372. if (!altquery.equals(query) && !Values.EMPTYSTRING.equals(altquery)) {
  373. altQueries.add(altquery);
  374. }
  375. altquery = this.splitKeywords(query, false).trim();
  376. if (!altquery.equals("") && !altquery.equals(query) && !altQueries.contains(altquery)) {
  377. altQueries.add(altquery);
  378. }
  379. StringBuilder stringBuilder = new StringBuilder();
  380. for (String word : query.replaceAll(" +", " ").split(" ")) {
  381. if (!word.trim().equals("AND") && !word.trim().equals("OR") && !word.trim().equals("NOT")) {
  382. stringBuilder.append(" ").append(this.spellingCorrector.correct(word));
  383. }
  384. }
  385. altquery = stringBuilder.toString().trim();
  386. if (!altquery.toLowerCase().equals(query.toLowerCase()) && !altQueries.contains(altquery)) {
  387. altQueries.add(altquery);
  388. }
  389. altquery = query.replace(" AND ", " OR ");
  390. if (!altquery.toLowerCase().equals(query.toLowerCase()) && !altQueries.contains(altquery)) {
  391. altQueries.add(altquery);
  392. }
  393. altquery = query.replace(" AND ", " ");
  394. if (!altquery.toLowerCase().equals(query.toLowerCase()) && !altQueries.contains(altquery)) {
  395. altQueries.add(altquery);
  396. }
  397. altquery = query.replace(" NOT ", " ");
  398. if (!altquery.toLowerCase().equals(query.toLowerCase()) && !altQueries.contains(altquery)) {
  399. altQueries.add(altquery);
  400. }
  401. return altQueries;
  402. }
  403. public String generateBusBlurb(ProjectStats projectStats) {
  404. StringBuilder stringBuilder = new StringBuilder();
  405. stringBuilder.append("In this repository ").append(projectStats.getRepoFacetOwner().size());
  406. if (projectStats.getRepoFacetOwner().size() == 1) {
  407. stringBuilder.append(" committer has contributed to ");
  408. } else {
  409. stringBuilder.append(" committers have contributed to ");
  410. }
  411. if (projectStats.getTotalFiles() == 1) {
  412. stringBuilder.append(projectStats.getTotalFiles()).append(" file. ");
  413. } else {
  414. stringBuilder.append(projectStats.getTotalFiles()).append(" files. ");
  415. }
  416. List<CodeFacetLanguage> codeFacetLanguages = projectStats.getCodeFacetLanguages();
  417. if (codeFacetLanguages.size() == 1) {
  418. stringBuilder.append("The most important language in this repository is ").append(codeFacetLanguages.get(0).getLanguageName()).append(". ");
  419. } else {
  420. stringBuilder.append("The most important languages in this repository are ");
  421. if (!codeFacetLanguages.isEmpty()) {
  422. if (codeFacetLanguages.size() > 3) {
  423. codeFacetLanguages = codeFacetLanguages.subList(0, 3);
  424. }
  425. for (int i = 0; i < codeFacetLanguages.size() - 1; i++) {
  426. stringBuilder.append(codeFacetLanguages.get(i).getLanguageName()).append(", ");
  427. }
  428. stringBuilder.append(" and ").append(codeFacetLanguages.get(codeFacetLanguages.size() - 1).getLanguageName()).append(". ");
  429. }
  430. }
  431. if (!projectStats.getRepoFacetOwner().isEmpty()) {
  432. if (projectStats.getRepoFacetOwner().size() < 5) {
  433. stringBuilder.append("The project has a low bus factor of ").append(projectStats.getRepoFacetOwner().size());
  434. stringBuilder.append(" and will be in trouble if ").append(projectStats.getRepoFacetOwner().get(0).getOwner()).append(" is hit by a bus. ");
  435. } else if (projectStats.getRepoFacetOwner().size() < 15) {
  436. stringBuilder.append("The project has bus factor of ").append(projectStats.getRepoFacetOwner().size()).append(". ");
  437. } else {
  438. stringBuilder.append("The project has high bus factor of ").append(projectStats.getRepoFacetOwner().size()).append(". ");
  439. }
  440. }
  441. List<String> highKnowledge = new ArrayList<>();
  442. double sumAverageFilesWorked = 0;
  443. for (CodeFacetOwner codeFacetOwner : projectStats.getRepoFacetOwner()) {
  444. double currentAverage = (double) codeFacetOwner.getCount() / (double) projectStats.getTotalFiles();
  445. sumAverageFilesWorked += currentAverage;
  446. if (currentAverage > 0.1) {
  447. highKnowledge.add(codeFacetOwner.getOwner());
  448. }
  449. }
  450. int averageFilesWorked = (int) (sumAverageFilesWorked / projectStats.getRepoFacetOwner().size() * 100);
  451. stringBuilder.append("The average person who commits this project has ownership of ");
  452. stringBuilder.append(averageFilesWorked).append("% of files. ");
  453. if (!highKnowledge.isEmpty()) {
  454. stringBuilder.append("The project relies on the following people; ");
  455. stringBuilder.append(StringUtils.join(highKnowledge, ", ")).append(". ");
  456. }
  457. return stringBuilder.toString().replace(", and", " and");
  458. }
  459. // /**
  460. // * Currently not used but meant to replicate the searchcode.com hash which is used to identify duplicate files
  461. // * even when they have a few characters or lines missing. It should in these cases produce identical hashes.
  462. // */
  463. // public String hash(String contents) {
  464. // int hashLength = 20;
  465. //
  466. // if (contents.length() == 0) {
  467. // return Strings.padStart("", hashLength, '0');
  468. // }
  469. //
  470. // String allowedCharacters = "BCDFGHIJKLMNOPQRSUVWXYZbcdfghijklmnopqrsuvwxyz1234567890";
  471. //
  472. // // remove all spaces
  473. // Joiner joiner = Joiner.on("").skipNulls();
  474. // String toHash = joiner.join(Splitter.on(' ')
  475. // .trimResults()
  476. // .omitEmptyStrings()
  477. // .split(contents));
  478. //
  479. // // remove all non acceptable characters
  480. // for(int i=0; i< toHash.length(); i++) {
  481. // char c = toHash.charAt(i);
  482. //
  483. // if (allowedCharacters.indexOf(c) != -1) {
  484. // // allowed so keep it
  485. // }
  486. // }
  487. //
  488. // return "";
  489. // }
  490. }