PageRenderTime 26ms CodeModel.GetById 0ms RepoModel.GetById 0ms app.codeStats 0ms

/SnuDom/src/autolink.cpp

https://github.com/hippiehunter/Baconography
C++ | 403 lines | 261 code | 84 blank | 58 comment | 145 complexity | d79cd233edd8b8fbe6e03ed354aeaf06 MD5 | raw file
  1. /*
  2. * Copyright (c) 2011, Vicent Marti
  3. *
  4. * Permission to use, copy, modify, and distribute this software for any
  5. * purpose with or without fee is hereby granted, provided that the above
  6. * copyright notice and this permission notice appear in all copies.
  7. *
  8. * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
  9. * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
  10. * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
  11. * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  12. * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  13. * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
  14. * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  15. */
  16. #include "buffer.h"
  17. #include "autolink.h"
  18. #include <string.h>
  19. #include <stdlib.h>
  20. #include <stdio.h>
  21. #include <ctype.h>
  22. #if defined(_WIN32)
  23. #define strncasecmp _strnicmp
  24. #endif
  25. int
  26. sd_autolink_issafe(const uint8_t *link, size_t link_len)
  27. {
  28. static const size_t valid_uris_count = 14;
  29. static const char *valid_uris[] = {
  30. "http://", "https://", "ftp://", "mailto://",
  31. "/", "git://", "steam://", "irc://", "news://", "mumble://",
  32. "ssh://", "ircs://", "ts3server://", "#"
  33. };
  34. size_t i;
  35. for (i = 0; i < valid_uris_count; ++i) {
  36. size_t len = strlen(valid_uris[i]);
  37. if (link_len > len &&
  38. strncasecmp((char *)link, valid_uris[i], len) == 0 &&
  39. (isalnum(link[len]) || link[len] == '#' || link[len] == '/' || link[len] == '?'))
  40. return 1;
  41. }
  42. return 0;
  43. }
  44. static size_t
  45. autolink_delim(uint8_t *data, size_t link_end, size_t offset, size_t size)
  46. {
  47. uint8_t cclose, copen = 0;
  48. size_t i;
  49. for (i = 0; i < link_end; ++i)
  50. if (data[i] == '<') {
  51. link_end = i;
  52. break;
  53. }
  54. while (link_end > 0) {
  55. if (strchr("?!.,", data[link_end - 1]) != NULL)
  56. link_end--;
  57. else if (data[link_end - 1] == ';') {
  58. size_t new_end = link_end - 2;
  59. while (new_end > 0 && isalpha(data[new_end]))
  60. new_end--;
  61. if (new_end < link_end - 2 && data[new_end] == '&')
  62. link_end = new_end;
  63. else
  64. link_end--;
  65. }
  66. else break;
  67. }
  68. if (link_end == 0)
  69. return 0;
  70. cclose = data[link_end - 1];
  71. switch (cclose) {
  72. case '"': copen = '"'; break;
  73. case '\'': copen = '\''; break;
  74. case ')': copen = '('; break;
  75. case ']': copen = '['; break;
  76. case '}': copen = '{'; break;
  77. }
  78. if (copen != 0) {
  79. size_t closing = 0;
  80. size_t opening = 0;
  81. size_t i = 0;
  82. /* Try to close the final punctuation sign in this same line;
  83. * if we managed to close it outside of the URL, that means that it's
  84. * not part of the URL. If it closes inside the URL, that means it
  85. * is part of the URL.
  86. *
  87. * Examples:
  88. *
  89. * foo http://www.pokemon.com/Pikachu_(Electric) bar
  90. * => http://www.pokemon.com/Pikachu_(Electric)
  91. *
  92. * foo (http://www.pokemon.com/Pikachu_(Electric)) bar
  93. * => http://www.pokemon.com/Pikachu_(Electric)
  94. *
  95. * foo http://www.pokemon.com/Pikachu_(Electric)) bar
  96. * => http://www.pokemon.com/Pikachu_(Electric))
  97. *
  98. * (foo http://www.pokemon.com/Pikachu_(Electric)) bar
  99. * => foo http://www.pokemon.com/Pikachu_(Electric)
  100. */
  101. while (i < link_end) {
  102. if (data[i] == copen)
  103. opening++;
  104. else if (data[i] == cclose)
  105. closing++;
  106. i++;
  107. }
  108. if (closing != opening)
  109. link_end--;
  110. }
  111. return link_end;
  112. }
  113. static size_t
  114. check_domain(uint8_t *data, size_t size, int allow_short)
  115. {
  116. size_t i, np = 0;
  117. if (!isalnum(data[0]))
  118. return 0;
  119. for (i = 1; i < size - 1; ++i) {
  120. if (data[i] == '.') np++;
  121. else if (!isalnum(data[i]) && data[i] != '-') break;
  122. }
  123. if (allow_short) {
  124. /* We don't need a valid domain in the strict sense (with
  125. * least one dot; so just make sure it's composed of valid
  126. * domain characters and return the length of the the valid
  127. * sequence. */
  128. return i;
  129. } else {
  130. /* a valid domain needs to have at least a dot.
  131. * that's as far as we get */
  132. return np ? i : 0;
  133. }
  134. }
  135. size_t
  136. sd_autolink__www(
  137. void* opaque, void* (*allocate)(void *opaque, size_t size),
  138. size_t *rewind_p,
  139. struct buf *link,
  140. uint8_t *data,
  141. size_t offset,
  142. size_t size,
  143. unsigned int flags)
  144. {
  145. size_t link_end;
  146. if (offset > 0 && !ispunct(data[-1]) && !isspace(data[-1]))
  147. return 0;
  148. if (size < 4 || memcmp(data, "www.", strlen("www.")) != 0)
  149. return 0;
  150. link_end = check_domain(data, size, 0);
  151. if (link_end == 0)
  152. return 0;
  153. while (link_end < size && !isspace(data[link_end]))
  154. link_end++;
  155. link_end = autolink_delim(data, link_end, offset, size);
  156. if (link_end == 0)
  157. return 0;
  158. bufput(opaque, allocate, link, data, link_end);
  159. *rewind_p = 0;
  160. return (int)link_end;
  161. }
  162. size_t
  163. sd_autolink__email(
  164. void* opaque, void* (*allocate)(void *opaque, size_t size),
  165. size_t *rewind_p,
  166. struct buf *link,
  167. uint8_t *data,
  168. size_t offset,
  169. size_t size,
  170. unsigned int flags)
  171. {
  172. size_t link_end, rewind;
  173. int nb = 0, np = 0;
  174. for (rewind = 0; rewind < offset; ++rewind) {
  175. uint8_t c = data[-((ptrdiff_t)rewind) - 1];
  176. if (isalnum(c))
  177. continue;
  178. if (strchr(".+-_", c) != NULL)
  179. continue;
  180. break;
  181. }
  182. if (rewind == 0)
  183. return 0;
  184. for (link_end = 0; link_end < size; ++link_end) {
  185. uint8_t c = data[link_end];
  186. if (isalnum(c))
  187. continue;
  188. if (c == '@')
  189. nb++;
  190. else if (c == '.' && link_end < size - 1)
  191. np++;
  192. else if (c != '-' && c != '_')
  193. break;
  194. }
  195. if (link_end < 2 || nb != 1 || np == 0)
  196. return 0;
  197. link_end = autolink_delim(data, link_end, offset, size);
  198. if (link_end == 0)
  199. return 0;
  200. bufput(opaque, allocate, link, data - rewind, link_end + rewind);
  201. *rewind_p = rewind;
  202. return link_end;
  203. }
  204. size_t
  205. sd_autolink__url(
  206. void* opaque, void* (*allocate)(void *opaque, size_t size),
  207. size_t *rewind_p,
  208. struct buf *link,
  209. uint8_t *data,
  210. size_t offset,
  211. size_t size,
  212. unsigned int flags)
  213. {
  214. size_t link_end, rewind = 0, domain_len;
  215. if (size < 4 || data[1] != '/' || data[2] != '/')
  216. return 0;
  217. while (rewind < offset && isalpha(data[-((ptrdiff_t)rewind) - 1]))
  218. rewind++;
  219. if (!sd_autolink_issafe(data - rewind, size + rewind))
  220. return 0;
  221. link_end = strlen("://");
  222. domain_len = check_domain(
  223. data + link_end,
  224. size - link_end,
  225. flags & SD_AUTOLINK_SHORT_DOMAINS);
  226. if (domain_len == 0)
  227. return 0;
  228. link_end += domain_len;
  229. while (link_end < size && !isspace(data[link_end]))
  230. link_end++;
  231. link_end = autolink_delim(data, link_end, offset, size);
  232. if (link_end == 0)
  233. return 0;
  234. bufput(opaque, allocate, link, data - rewind, link_end + rewind);
  235. *rewind_p = rewind;
  236. return link_end;
  237. }
  238. size_t
  239. sd_autolink__subreddit(void* opaque, void* (*allocate)(void *opaque, size_t size), size_t *rewind_p, struct buf *link, uint8_t *data, size_t offset, size_t size)
  240. {
  241. size_t link_end;
  242. int is_allminus = 0;
  243. if (size < 3)
  244. return 0;
  245. /* make sure this / is part of /r/ */
  246. if (strncmp((char*)data, "/r/", 3) != 0)
  247. return 0;
  248. link_end = strlen("/r/");
  249. if (strncasecmp((char*)data + link_end, "all-", 4) == 0)
  250. is_allminus = 1;
  251. do {
  252. size_t start = link_end;
  253. int max_length = 24;
  254. /* special case: /r/reddit.com (only subreddit containing '.'). */
  255. if ( size >= link_end+10 && strncasecmp((char*)data+link_end, "reddit.com", 10) == 0 ) {
  256. link_end += 10;
  257. /* Make sure there are no trailing characters (don't do
  258. * any autolinking for /r/reddit.commission) */
  259. max_length = 10;
  260. }
  261. /* If not a special case, verify it begins with (t:)?[A-Za-z0-9] */
  262. else {
  263. /* support autolinking to timereddits, /r/t:when (1 April 2012) */
  264. if ( size > link_end+2 && strncasecmp((char*)data+link_end, "t:", 2) == 0 )
  265. link_end += 2; /* Jump over the 't:' */
  266. /* the first character of a subreddit name must be a letter or digit */
  267. if (!isalnum(data[link_end]))
  268. return 0;
  269. link_end += 1;
  270. }
  271. /* consume valid characters ([A-Za-z0-9_]) until we run out */
  272. while (link_end < size && (isalnum(data[link_end]) ||
  273. data[link_end] == '_'))
  274. link_end++;
  275. /* valid subreddit names are between 3 and 21 characters, with
  276. * some subreddits having 2-character names. Don't bother with
  277. * autolinking for anything outside this length range.
  278. * (chksrname function in reddit/.../validator.py) */
  279. if ( link_end-start < 2 || link_end-start > max_length )
  280. return 0;
  281. /* If we are linking to a multireddit, continue */
  282. } while ( link_end < size && (data[link_end] == '+' || (is_allminus && data[link_end] == '-')) && link_end++ );
  283. if (link_end < size && data[link_end] == '/') {
  284. while (link_end < size && (isalnum(data[link_end]) ||
  285. data[link_end] == '_' ||
  286. data[link_end] == '/' ||
  287. data[link_end] == '-'))
  288. link_end++;
  289. }
  290. /* make the link */
  291. bufput(opaque, allocate, link, data, link_end);
  292. *rewind_p = 0;
  293. return link_end;
  294. }
  295. size_t
  296. sd_autolink__username(void* opaque, void* (*allocate)(void *opaque, size_t size), size_t *rewind_p, struct buf *link, uint8_t *data, size_t offset, size_t size)
  297. {
  298. size_t link_end;
  299. if (size < 6)
  300. return 0;
  301. /* make sure this / is part of /u/ */
  302. if (strncmp((char*)data, "/u/", 3) != 0)
  303. return 0;
  304. /* the first letter of a username must... well, be valid, we don't care otherwise */
  305. link_end = strlen("/u/");
  306. if (!isalnum(data[link_end]) && data[link_end] != '_' && data[link_end] != '-')
  307. return 0;
  308. link_end += 1;
  309. /* consume valid characters ([A-Za-z0-9_-/]) until we run out */
  310. while (link_end < size && (isalnum(data[link_end]) ||
  311. data[link_end] == '_' ||
  312. data[link_end] == '/' ||
  313. data[link_end] == '-'))
  314. link_end++;
  315. /* make the link */
  316. bufput(opaque, allocate, link, data, link_end);
  317. *rewind_p = 0;
  318. return link_end;
  319. }