PageRenderTime 65ms CodeModel.GetById 29ms RepoModel.GetById 1ms app.codeStats 0ms

/src/main/java/ac/simons/autolinker/EmailAddressAutoLinker.java

https://github.com/michael-simons/java-autolinker
Java | 139 lines | 65 code | 17 blank | 57 comment | 6 complexity | a5e0f0015338622f3e9af0a5728ba088 MD5 | raw file
  1. /*
  2. * Copyright 2014-2018 michael-simons.eu.
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. package ac.simons.autolinker;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.parser.Tag;
  26. /**
  27. * Turns valid ASCII email adresses into anchor text. The label can be
  28. * obsfuscated, the email address can be encoded to hex.
  29. *
  30. * @author Michael J. Simons, 2014-12-27
  31. */
  32. public final class EmailAddressAutoLinker implements AutoLinker {
  33. /**
  34. * Regex according to http://www.w3.org/Protocols/rfc822/, Originally
  35. * written by Cal Henderson
  36. * (http://iamcal.com/publish/articles/php/parsing_email/), Translated to
  37. * Ruby by Tim Fletcher, with changes suggested by Dan Kubb. Translated to
  38. * Java by Michael J. Simons
  39. */
  40. private static final String VALID_EMAIL_ADDRESS_REGEX = "(?:[^\\x00-\\x20\\x22\\x28\\x29\\x2c\\x2e\\x3a-\\x3c\\x3e\\x40\\x5b-\\x5d\\x7f-\\xff]+|\\x22(?:[^\\x0d\\x22\\x5c\\x80-\\xff]|\\x5c[\\x00-\\x7f])*\\x22)(?:\\x2e(?:[^\\x00-\\x20\\x22\\x28\\x29\\x2c\\x2e\\x3a-\\x3c\\x3e\\x40\\x5b-\\x5d\\x7f-\\xff]+|\\x22(?:[^\\x0d\\x22\\x5c\\x80-\\xff]|\\x5c[\\x00-\\x7f])*\\x22))*\\x40(?:[^\\x00-\\x20\\x22\\x28\\x29\\x2c\\x2e\\x3a-\\x3c\\x3e\\x40\\x5b-\\x5d\\x7f-\\xff]+|\\x5b(?:[^\\x0d\\x5b-\\x5d\\x80-\\xff]|\\x5c[\\x00-\\x7f])*\\x5d)(?:\\x2e(?:[^\\x00-\\x20\\x22\\x28\\x29\\x2c\\x2e\\x3a-\\x3c\\x3e\\x40\\x5b-\\x5d\\x7f-\\xff]+|\\x5b(?:[^\\x0d\\x5b-\\x5d\\x80-\\xff]|\\x5c[\\x00-\\x7f])*\\x5d))*";
  41. public static final Pattern VALID_EMAIL_ADDRESS = Pattern.compile(String.format("\\A%s\\z", VALID_EMAIL_ADDRESS_REGEX));
  42. public static final Pattern VALID_EMAIL_ADRESS_ML = Pattern.compile(String.format("(?m)(?<![^\\s])%s", VALID_EMAIL_ADDRESS_REGEX));
  43. public static final Pattern AT_SIGNS = Pattern.compile("[@\uFF20\\x40]");
  44. /**
  45. * A flag if the addresses in the mailto: protocoll should be hex-encoded
  46. */
  47. private final boolean hexEncodeEmailAddress;
  48. /**
  49. * A flag if the labels should be obfuscated
  50. */
  51. private final boolean obfuscateEmailAddress;
  52. /**
  53. * Instantiates a new email address autolinker.
  54. *
  55. * @param hexEncodeEmailAddress Should mailto: Addresses be hex-encoded?
  56. * @param obfuscateEmailAddress Should labels be obfuscated?
  57. */
  58. public EmailAddressAutoLinker(
  59. final boolean hexEncodeEmailAddress,
  60. final boolean obfuscateEmailAddress
  61. ) {
  62. this.hexEncodeEmailAddress = hexEncodeEmailAddress;
  63. this.obfuscateEmailAddress = obfuscateEmailAddress;
  64. }
  65. @Override
  66. public List<Node> createLinks(final TextNode textNode) {
  67. final List<Node> rv = new ArrayList<>();
  68. int start = 0;
  69. final String nodeText = textNode.getWholeText();
  70. final String baseUri = textNode.baseUri();
  71. final Matcher matcher = VALID_EMAIL_ADRESS_ML.matcher(nodeText);
  72. while (matcher.find()) {
  73. final String emailAddress = matcher.group();
  74. if (!(new String(emailAddress.getBytes(), StandardCharsets.US_ASCII)).equals(emailAddress)) {
  75. continue;
  76. }
  77. final String textBefore = nodeText.substring(start, matcher.start());
  78. if (!textBefore.isEmpty()) {
  79. rv.add(new TextNode(textBefore));
  80. }
  81. final Element newAnchor = new Element(Tag.valueOf("a"), baseUri);
  82. newAnchor.attr("href", String.format("%s%s", "mailto:", hexEncodeEmailAddress ? hexEncodeEmailAddress(emailAddress) : emailAddress));
  83. newAnchor.appendChild(new TextNode(obfuscateEmailAddress ? obfuscateEmailAddress(emailAddress) : emailAddress));
  84. rv.add(newAnchor);
  85. start = matcher.end();
  86. }
  87. // Add a new textnode for everything after
  88. final String textAfter = nodeText.substring(start);
  89. if (!textAfter.isEmpty()) {
  90. rv.add(new TextNode(textAfter));
  91. }
  92. return rv;
  93. }
  94. /**
  95. * Obfuscates an email address. @ will be replaced throught " [AT] " and .
  96. * through " [DOT] ". The email address is lowercased before processing.
  97. *
  98. * @param emailAddress The email address to obfuscate
  99. * @return An obfuscated email address
  100. */
  101. public String obfuscateEmailAddress(final String emailAddress) {
  102. return AT_SIGNS.matcher(emailAddress.toLowerCase()).replaceAll(" [AT] ").replaceAll("\\.", " [DOT] ");
  103. }
  104. /**
  105. * Hex encodes an email addess, leaving the '@' intact. Browsers are able to
  106. * decode this and maybe it's stops spammers from using emails like that.
  107. * The email address is lowercased before processing.
  108. *
  109. * @param emailAddress The email address that should be encoded to
  110. * hexadecimal
  111. * @return An hexadecimal encoded email adresse
  112. */
  113. public String hexEncodeEmailAddress(final String emailAddress) {
  114. final String emailAddressLc = emailAddress.toLowerCase();
  115. final StringBuilder rv = new StringBuilder();
  116. for (int i = 0; i < emailAddressLc.length(); ++i) {
  117. char c = emailAddressLc.charAt(i);
  118. rv.append(c == '@' ? c : String.format("%%%x", (int) c));
  119. }
  120. return rv.toString();
  121. }
  122. }