PageRenderTime 660ms CodeModel.GetById 21ms RepoModel.GetById 1ms app.codeStats 0ms

/core/fpdftext/cpdf_linkextract_unittest.cpp

https://gitlab.com/kkowalczyk/pdfium
C++ | 185 lines | 155 code | 12 blank | 18 comment | 4 complexity | 866fa2c5471210fd496a387369c6d4ee MD5 | raw file
  1. // Copyright 2015 PDFium Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style license that can be
  3. // found in the LICENSE file.
  4. #include "core/fpdftext/cpdf_linkextract.h"
  5. #include "testing/gtest/include/gtest/gtest.h"
  6. // Class to help test functions in CPDF_LinkExtract class.
  7. class CPDF_TestLinkExtract final : public CPDF_LinkExtract {
  8. public:
  9. CPDF_TestLinkExtract() : CPDF_LinkExtract(nullptr) {}
  10. private:
  11. // Add test cases as friends to access protected member functions.
  12. // Access CheckMailLink and CheckWebLink.
  13. FRIEND_TEST(CPDF_LinkExtractTest, CheckMailLink);
  14. FRIEND_TEST(CPDF_LinkExtractTest, CheckWebLink);
  15. };
  16. TEST(CPDF_LinkExtractTest, CheckMailLink) {
  17. CPDF_TestLinkExtract extractor;
  18. // Check cases that fail to extract valid mail link.
  19. const wchar_t* const kInvalidStrings[] = {
  20. L"",
  21. L"peter.pan", // '@' is required.
  22. L"abc@server", // Domain name needs at least one '.'.
  23. L"abc.@gmail.com", // '.' can not immediately precede '@'.
  24. L"abc@xyz&q.org", // Domain name should not contain '&'.
  25. L"abc@.xyz.org", // Domain name should not start with '.'.
  26. L"fan@g..com" // Domain name should not have consecutive '.'
  27. };
  28. for (const wchar_t* input : kInvalidStrings) {
  29. WideString text_str(input);
  30. EXPECT_FALSE(extractor.CheckMailLink(&text_str)) << input;
  31. }
  32. // Check cases that can extract valid mail link.
  33. // An array of {input_string, expected_extracted_email_address}.
  34. const wchar_t* const kValidStrings[][2] = {
  35. {L"peter@abc.d", L"peter@abc.d"},
  36. {L"red.teddy.b@abc.com", L"red.teddy.b@abc.com"},
  37. {L"abc_@gmail.com", L"abc_@gmail.com"}, // '_' is ok before '@'.
  38. {L"dummy-hi@gmail.com",
  39. L"dummy-hi@gmail.com"}, // '-' is ok in user name.
  40. {L"a..df@gmail.com", L"df@gmail.com"}, // Stop at consecutive '.'.
  41. {L".john@yahoo.com", L"john@yahoo.com"}, // Remove heading '.'.
  42. {L"abc@xyz.org?/", L"abc@xyz.org"}, // Trim ending invalid chars.
  43. {L"fan{abc@xyz.org", L"abc@xyz.org"}, // Trim beginning invalid chars.
  44. {L"fan@g.com..", L"fan@g.com"}, // Trim the ending periods.
  45. {L"CAP.cap@Gmail.Com", L"CAP.cap@Gmail.Com"}, // Keep the original case.
  46. };
  47. for (const auto& it : kValidStrings) {
  48. const wchar_t* const input = it[0];
  49. WideString text_str(input);
  50. WideString expected_str(L"mailto:");
  51. expected_str += it[1];
  52. EXPECT_TRUE(extractor.CheckMailLink(&text_str)) << input;
  53. EXPECT_STREQ(expected_str.c_str(), text_str.c_str());
  54. }
  55. }
  56. TEST(CPDF_LinkExtractTest, CheckWebLink) {
  57. CPDF_TestLinkExtract extractor;
  58. // Check cases that fail to extract valid web link.
  59. // The last few are legit web addresses that we don't handle now.
  60. const wchar_t* const kInvalidCases[] = {
  61. L"", L"http", L"www.", L"https-and-www",
  62. L"http:/abc.com", // Missing slash.
  63. L"http://((()),", // Only invalid chars in host name.
  64. L"ftp://example.com", // Ftp scheme is not supported.
  65. L"http:example.com", // Missing slashes.
  66. L"http//[example.com", // Invalid IPv6 address.
  67. L"http//[00:00:00:00:00:00", // Invalid IPv6 address.
  68. L"http//[]", // Empty IPv6 address.
  69. // Web addresses that in correct format that we don't handle.
  70. L"abc.example.com", // URL without scheme.
  71. };
  72. constexpr int32_t kDefaultValue = -42;
  73. for (const wchar_t* input : kInvalidCases) {
  74. WideString text_str(input);
  75. int32_t start_offset = kDefaultValue;
  76. int32_t count = kDefaultValue;
  77. EXPECT_FALSE(extractor.CheckWebLink(&text_str, &start_offset, &count))
  78. << input;
  79. EXPECT_EQ(kDefaultValue, start_offset) << input;
  80. EXPECT_EQ(kDefaultValue, count) << input;
  81. }
  82. // Check cases that can extract valid web link.
  83. // An array of {input_string, expected_extracted_web_link}.
  84. struct ValidCase {
  85. const wchar_t* const input_string;
  86. const wchar_t* const url_extracted;
  87. const int32_t start_offset;
  88. const int32_t count;
  89. };
  90. const ValidCase kValidCases[] = {
  91. {L"http://www.example.com", L"http://www.example.com", 0,
  92. 22}, // standard URL.
  93. {L"http://www.example.com:88", L"http://www.example.com:88", 0,
  94. 25}, // URL with port number.
  95. {L"http://test@www.example.com", L"http://test@www.example.com", 0,
  96. 27}, // URL with username.
  97. {L"http://test:test@example.com", L"http://test:test@example.com", 0,
  98. 28}, // URL with username and password.
  99. {L"http://example", L"http://example", 0,
  100. 14}, // URL with short domain name.
  101. {L"http////www.server", L"http://www.server", 8,
  102. 10}, // URL starts with "www.".
  103. {L"http:/www.abc.com", L"http://www.abc.com", 6,
  104. 11}, // URL starts with "www.".
  105. {L"www.a.b.c", L"http://www.a.b.c", 0, 9}, // URL starts with "www.".
  106. {L"https://a.us", L"https://a.us", 0, 12}, // Secure http URL.
  107. {L"https://www.t.us", L"https://www.t.us", 0, 16}, // Secure http URL.
  108. {L"www.example-test.com", L"http://www.example-test.com", 0,
  109. 20}, // '-' in host is ok.
  110. {L"www.example.com,", L"http://www.example.com", 0,
  111. 15}, // Trim ending invalid chars.
  112. {L"www.example.com;(", L"http://www.example.com", 0,
  113. 15}, // Trim ending invalid chars.
  114. {L"test:www.abc.com", L"http://www.abc.com", 5,
  115. 11}, // Trim chars before URL.
  116. {L"(http://www.abc.com)", L"http://www.abc.com", 1,
  117. 18}, // Trim external brackets.
  118. {L"0(http://www.abc.com)0", L"http://www.abc.com", 2,
  119. 18}, // Trim chars outside brackets as well.
  120. {L"0(www.abc.com)0", L"http://www.abc.com", 2,
  121. 11}, // Links without http should also have brackets trimmed.
  122. {L"http://www.abc.com)0", L"http://www.abc.com)0", 0,
  123. 20}, // Do not trim brackets that were not opened.
  124. {L"{(<http://www.abc.com>)}", L"http://www.abc.com", 3,
  125. 18}, // Trim chars with multiple levels of brackets.
  126. {L"[http://www.abc.com/z(1)]", L"http://www.abc.com/z(1)", 1,
  127. 23}, // Brackets opened inside the URL should not be trimmed.
  128. {L"(http://www.abc.com/z(1))", L"http://www.abc.com/z(1)", 1,
  129. 23}, // Brackets opened inside the URL should not be trimmed.
  130. {L"\"http://www.abc.com\"", L"http://www.abc.com", 1,
  131. 18}, // External quotes can also be escaped
  132. {L"www.g.com..", L"http://www.g.com..", 0, 11}, // Leave ending periods.
  133. // Web links can contain IP addresses too.
  134. {L"http://192.168.0.1", L"http://192.168.0.1", 0, 18}, // IPv4 address.
  135. {L"http://192.168.0.1:80", L"http://192.168.0.1:80", 0,
  136. 21}, // IPv4 address with port.
  137. {L"http://[aa::00:bb::00:cc:00]", L"http://[aa::00:bb::00:cc:00]", 0,
  138. 28}, // IPv6 reference.
  139. {L"http://[aa::00:bb::00:cc:00]:12", L"http://[aa::00:bb::00:cc:00]:12",
  140. 0, 31}, // IPv6 reference with port.
  141. {L"http://[aa]:12", L"http://[aa]:12", 0,
  142. 14}, // Not validate IP address.
  143. {L"http://[aa]:12abc", L"http://[aa]:12", 0,
  144. 14}, // Trim for IPv6 address.
  145. {L"http://[aa]:", L"http://[aa]", 0, 11}, // Trim for IPv6 address.
  146. // Path and query parts can be anything.
  147. {L"www.abc.com/#%%^&&*(", L"http://www.abc.com/#%%^&&*(", 0, 20},
  148. {L"www.a.com/#a=@?q=rr&r=y", L"http://www.a.com/#a=@?q=rr&r=y", 0, 23},
  149. {L"http://a.com/1/2/3/4\5\6", L"http://a.com/1/2/3/4\5\6", 0, 22},
  150. {L"http://www.example.com/foo;bar", L"http://www.example.com/foo;bar", 0,
  151. 30},
  152. // Invalid chars inside host name are ok as we don't validate them.
  153. {L"http://ex[am]ple", L"http://ex[am]ple", 0, 16},
  154. {L"http://:example.com", L"http://:example.com", 0, 19},
  155. {L"http://((())/path?", L"http://((())/path?", 0, 18},
  156. {L"http:////abc.server", L"http:////abc.server", 0, 19},
  157. // Non-ASCII chars are not validated either.
  158. {L"www.测试.net", L"http://www.测试.net", 0, 10},
  159. {L"www.测试。net。", L"http://www.测试。net。", 0, 11},
  160. {L"www.测试.net;", L"http://www.测试.net;", 0, 11},
  161. };
  162. for (const auto& it : kValidCases) {
  163. const wchar_t* const input = it.input_string;
  164. WideString text_str(input);
  165. int32_t start_offset = kDefaultValue;
  166. int32_t count = kDefaultValue;
  167. EXPECT_TRUE(extractor.CheckWebLink(&text_str, &start_offset, &count))
  168. << input;
  169. EXPECT_STREQ(it.url_extracted, text_str.c_str());
  170. EXPECT_EQ(it.start_offset, start_offset) << input;
  171. EXPECT_EQ(it.count, count) << input;
  172. }
  173. }