PageRenderTime 62ms CodeModel.GetById 33ms RepoModel.GetById 0ms app.codeStats 0ms

/lib/MusicBrainz/Server/Validation.pm

https://github.com/navap/musicbrainz-server
Perl | 474 lines | 353 code | 70 blank | 51 comment | 61 complexity | 4720337219cc4f5b5b612824e6cee0c2 MD5 | raw file
  1. #!/usr/local/perl58/bin/perl -w
  2. # vi: set ts=4 sw=4 :
  3. #____________________________________________________________________________
  4. #
  5. # MusicBrainz -- the open internet music database
  6. #
  7. # Copyright (C) 2000 Robert Kaye
  8. #
  9. # This program is free software; you can redistribute it and/or modify
  10. # it under the terms of the GNU General Public License as published by
  11. # the Free Software Foundation; either version 2 of the License, or
  12. # (at your option) any later version.
  13. #
  14. # This program is distributed in the hope that it will be useful,
  15. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  16. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  17. # GNU General Public License for more details.
  18. #
  19. # You should have received a copy of the GNU General Public License
  20. # along with this program; if not, write to the Free Software
  21. # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  22. #
  23. # $Id: MusicBrainz.pm 8398 2006-08-13 01:45:27Z nikki $
  24. #____________________________________________________________________________
  25. package MusicBrainz::Server::Validation;
  26. require Exporter;
  27. {
  28. our @ISA = qw( Exporter );
  29. our @EXPORT_OK = qw(
  30. unaccent_utf16
  31. is_integer
  32. is_non_negative_integer
  33. is_positive_integer
  34. is_database_row_id
  35. is_database_bigint_id
  36. is_guid
  37. trim_in_place
  38. is_valid_iswc
  39. format_iswc
  40. is_valid_ipi
  41. format_ipi
  42. is_valid_isni
  43. format_isni
  44. is_valid_url
  45. is_freedb_id
  46. is_valid_discid
  47. is_valid_barcode
  48. is_valid_ean
  49. is_valid_isrc
  50. format_isrc
  51. is_valid_time
  52. is_valid_setlist
  53. is_valid_iso_3166_1
  54. is_valid_iso_3166_2
  55. is_valid_iso_3166_3
  56. is_valid_partial_date
  57. is_valid_edit_note
  58. encode_entities
  59. normalise_strings
  60. is_nat
  61. validate_coordinates
  62. )
  63. }
  64. use strict;
  65. use Carp qw( carp cluck croak );
  66. use List::AllUtils qw( any );
  67. use Encode qw( decode encode );
  68. use Scalar::Util qw( looks_like_number );
  69. use Text::Unaccent qw( unac_string_utf16 );
  70. use MusicBrainz::Server::Constants qw( $MAX_POSTGRES_INT $MAX_POSTGRES_BIGINT );
  71. use utf8;
  72. sub unaccent_utf16 ($)
  73. {
  74. my $str = shift;
  75. return ( defined $str ? unac_string_utf16(''.$str) : '' );
  76. }
  77. ################################################################################
  78. # Validation and sanitisation section
  79. ################################################################################
  80. sub is_integer
  81. {
  82. my $t = shift;
  83. defined($t) and not ref($t) and $t =~ /\A(-?[0-9]{1,20})\z/;
  84. }
  85. sub is_non_negative_integer {
  86. my $t = shift;
  87. is_integer($t) and $t >= 0;
  88. }
  89. sub is_positive_integer
  90. {
  91. my $t = shift;
  92. is_integer($t) and $t > 0;
  93. }
  94. sub is_database_row_id {
  95. my $t = shift;
  96. is_positive_integer($t) and $t <= $MAX_POSTGRES_INT;
  97. }
  98. sub is_database_bigint_id {
  99. my $t = shift;
  100. is_positive_integer($t) and $t <= $MAX_POSTGRES_BIGINT;
  101. }
  102. sub is_guid
  103. {
  104. my $t = $_[0];
  105. defined($t) and not ref($t) or return undef;
  106. length($t) == 36 or return undef;
  107. $t =~ /[^0-]/ or return undef;
  108. $t = lc $t;
  109. $t =~ /\A(
  110. [0-9a-f]{8}
  111. - [0-9a-f]{4}
  112. - [0-9a-f]{4}
  113. - [0-9a-f]{4}
  114. - [0-9a-f]{12}
  115. )\z/x or return undef;
  116. $_[0] = $1;
  117. 1;
  118. }
  119. sub trim_in_place
  120. {
  121. carp "Uninitialized value passed to trim_in_place"
  122. if grep { not defined } @_;
  123. for (@_)
  124. {
  125. $_ = "" if not defined;
  126. # TODO decode, trim, encode?
  127. s/\A\s+//;
  128. s/\s+\z//;
  129. s/\s+/ /g;
  130. }
  131. }
  132. sub is_valid_iswc
  133. {
  134. my $iswc = shift;
  135. $iswc =~ s/\s//g;
  136. return $iswc =~ /^T-?[0-9]{3}\.?[0-9]{3}\.?[0-9]{3}[-.]?[0-9]$/;
  137. }
  138. sub format_iswc
  139. {
  140. my $iswc = shift;
  141. $iswc =~ s/\s//g;
  142. $iswc =~ s/^T-?([0-9]{3})\.?([0-9]{3})\.?([0-9]{3})[-.]?([0-9])/T-$1.$2.$3-$4/;
  143. return $iswc;
  144. }
  145. sub is_valid_ipi
  146. {
  147. my $ipi = shift;
  148. return $ipi =~ /^[0-9]{11}$/;
  149. }
  150. sub format_ipi
  151. {
  152. my $ipi = shift;
  153. return $ipi unless $ipi =~ /^[0-9\s.]{5,}$/;
  154. $ipi =~ s/[\s.]//g;
  155. return sprintf("%011.0f", $ipi)
  156. }
  157. sub is_valid_isni
  158. {
  159. my $isni = shift;
  160. $isni =~ s/[\s\.-]//g;
  161. return $isni =~ /^[0-9]{15}[0-9X]$/;
  162. }
  163. sub format_isni {
  164. shift =~ s/[\s\.]//gr
  165. }
  166. sub is_valid_url
  167. {
  168. my ($url) = @_;
  169. return if $url =~ /\s/;
  170. require URI;
  171. my $u = eval { URI->new($url) }
  172. or return 0;
  173. return 0 if $u->scheme eq '';
  174. return 0 if $u->can('authority') && !($u->authority =~ /\./);
  175. return 1;
  176. }
  177. sub is_freedb_id {
  178. my $id = shift;
  179. return lc($id) =~ /^[a-f0-9]{8}$/;
  180. }
  181. sub is_valid_discid
  182. {
  183. my $discid = shift;
  184. return $discid =~ /^[A-Za-z0-9._-]{27}-/;
  185. }
  186. sub is_valid_barcode
  187. {
  188. my $barcode = shift;
  189. return $barcode =~ /^[0-9]+$/;
  190. }
  191. sub is_valid_ean
  192. {
  193. my $ean = shift;
  194. my $length = length($ean);
  195. if ($length == 8 || $length == 12 || $length == 13 || $length == 14 || $length == 17 || $length == 18) {
  196. my $sum = 0;
  197. for (my $i = 2; $i <= $length; $i++) {
  198. $sum += substr($ean, $length - $i, 1) * ($i % 2 == 1 ? 1 : 3);
  199. }
  200. return ((10 - $sum % 10) % 10) == substr($ean, $length - 1, 1);
  201. }
  202. return 0;
  203. }
  204. sub format_isrc
  205. {
  206. my $isrc = shift;
  207. $isrc =~ s/[\s-]//g;
  208. return uc $isrc;
  209. }
  210. sub is_valid_isrc
  211. {
  212. my $isrc = $_[0];
  213. return $isrc =~ /^[A-Z]{2}[A-Z0-9]{3}[0-9]{7}$/;
  214. }
  215. sub is_valid_time
  216. {
  217. my $time = shift;
  218. return $time =~ /^([01][0-9]|2[0-3]):[0-5][0-9]$/;
  219. }
  220. sub is_valid_setlist
  221. {
  222. my $setlist = shift;
  223. my @invalid_lines = grep { $_ !~ /^([@#*] |\s*$)/ } split('\r\n', $setlist); return @invalid_lines ? 0 : 1;
  224. }
  225. sub is_valid_iso_3166_1
  226. {
  227. my $iso_3166_1 = shift;
  228. return $iso_3166_1 =~ /^[A-Z]{2}$/;
  229. }
  230. sub is_valid_iso_3166_2
  231. {
  232. my $iso_3166_2 = shift;
  233. return $iso_3166_2 =~ /^[A-Z]{2}-[A-Z0-9]+$/;
  234. }
  235. sub is_valid_iso_3166_3
  236. {
  237. my $iso_3166_3 = shift;
  238. return $iso_3166_3 =~ /^[A-Z]{4}$/;
  239. }
  240. sub is_valid_partial_date
  241. {
  242. my ($year, $month, $day) = @_;
  243. if (defined $month) {
  244. return 0 unless is_positive_integer($month) && $month <= 12;
  245. }
  246. if (defined $day) {
  247. return 0 unless is_positive_integer($day) && $day <= 31;
  248. }
  249. if (defined $month && $day) {
  250. return 0 if $day > 29 && $month == 2;
  251. return 0 if $day > 30 && any { $_ == $month } (4, 6, 9, 11);
  252. }
  253. if (defined $year) {
  254. return 0 unless is_integer($year);
  255. }
  256. if (defined $year && $month && $day
  257. && $month == 2 && $day == 29)
  258. {
  259. return 0 unless $year % 4 == 0;
  260. return 0 if $year % 100 == 0 && $year % 400 != 0;
  261. }
  262. if (defined $year && $month && $day) {
  263. # XXX retain legacy behaviour for now:
  264. # partial dates with year <= 0 are OK, but complete dates are not (don't ask)
  265. return 0 unless $year > 0;
  266. }
  267. return 1;
  268. }
  269. sub is_valid_edit_note
  270. {
  271. my $edit_note = shift;
  272. # An edit note with only spaces and / or punctuation is useless
  273. return 0 if $edit_note =~ /^[[:space:][:punct:]]+$/;
  274. # An edit note with just one ASCII character is useless
  275. # A one-character Japanese note (for example) might be useful, so limited to ASCII
  276. return 0 if $edit_note =~ /^[[:ascii:]]$/;
  277. return 1;
  278. }
  279. ################################################################################
  280. # Our own Mason "escape" handler
  281. ################################################################################
  282. # HTML-encoding, but only on the listed "unsafe" characters. Specifically,
  283. # don't (incorrectly) encode top-bit-set characters as &Atilde; and the like.
  284. # Hmmm. For some reason HTML::Entities just wasn't kicking in here like it is
  285. # meant to - it just left the string untouched. So, since we only need a nice
  286. # simple, fixed, substitution, we'll do it ourselves. Ugh.
  287. my %ent = ( '>' => '&gt;', '<' => '&lt;', q/"/ => '&quot;', q/'/ => '&#39;', '&' => '&amp;');
  288. sub encode_entities
  289. {
  290. my $t = $_[0];
  291. $t =~ s/([<>"'&])/$ent{$1}/go;
  292. $t;
  293. }
  294. sub normalise_strings
  295. {
  296. my @r = map {
  297. my $t = $_;
  298. # Using lc() on U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE turns it into U+0069 LATIN SMALL LETTER I
  299. # and U+0307 COMBINING DOT ABOVE which causes problems later, so remove that before using lc().
  300. # U+0131 LATIN SMALL LETTER DOTLESS I is not handled by the unaccent code, so replace that too while we're at it.
  301. $t =~ tr/\x{0130}\x{0131}/i/;
  302. # Normalise to lower case
  303. $t = lc $t;
  304. # Remove leading and trailing space
  305. $t =~ s/\A\s+//;
  306. $t =~ s/\s+\z//;
  307. # Compress whitespace
  308. $t =~ s/\s+/ /g;
  309. # Quotation marks and apostrophes
  310. # 0060 grave accent, 00B4 acute accent, 00AB <<, 00BB >>, 02BB modifier letter turned comma (for Hawaiian)
  311. # 05F3 hebrew geresh, 05F4 hebrew gershayim
  312. # 2018 left single quote, 2019 right single quote, 201A low-9 single quote, 201B high-reversed-9 single quote
  313. # 201C left double quote, 201D right double quote, 201E low-9 double quote, 201F high-reversed-9 double quote
  314. # 2032 prime, 2033 double prime, 2039 <, 203A >
  315. $t =~ tr/"\x{0060}\x{00B4}\x{00AB}\x{00BB}\x{02BB}\x{05F3}\x{05F4}\x{2018}-\x{201F}\x{2032}\x{2033}\x{2039}\x{203A}/'/;
  316. # Dashes
  317. # 05BE Hebrew maqaf, 2010 hyphen, 2012 figure dash, 2013 en-dash, 2014 em-dash, 2015 horizontal bar, 2212 minus
  318. $t =~ tr/\x{05BE}\x{2010}\x{2012}\x{2013}\x{2014}\x{2015}\x{2212}/-/;
  319. # Horizontal three-dots ellipses
  320. # 2026 horizontal ellipsis,
  321. # 22EF midline horizontal ellipsis
  322. $t =~ s/[\x{2026}\x{22EF}]/.../g;
  323. # Unaccent what's left
  324. decode("utf-16", unaccent_utf16(encode("utf-16", $t)));
  325. } @_;
  326. wantarray ? @r : $r[-1];
  327. }
  328. sub is_nat {
  329. my $n = shift;
  330. return looks_like_number($n) && int($n) == $n && $n >= 0;
  331. }
  332. sub degree {
  333. my ($degrees, $dir) = @_;
  334. return dms($degrees, 0, 0, $dir);
  335. }
  336. sub dms {
  337. my ($degrees, $minutes, $seconds, $dir) = @_;
  338. $degrees =~ s/,/./;
  339. $minutes =~ s/,/./;
  340. $seconds =~ s/,/./;
  341. return
  342. sprintf("%.6f", ((0+$degrees) + ((0+$minutes) * 60 + (0+$seconds)) / 3600) * direction($dir))
  343. + 0; # remove trailing zeroes (MBS-7438)
  344. }
  345. my %DIRECTIONS = ( n => 1, s => -1, e => 1, w => -1 );
  346. sub direction { $DIRECTIONS{lc(shift() // '')} // 1 }
  347. sub swap {
  348. my ($direction_lat, $direction_long, $lat, $long) = @_;
  349. $direction_lat //= 'n';
  350. $direction_long //= 'e';
  351. # We expect lat/long, but can support long/lat
  352. if (lc $direction_lat eq 'e' || lc $direction_lat eq 'w' ||
  353. lc $direction_long eq 'n' || lc $direction_long eq 's') {
  354. return ($long, $lat);
  355. }
  356. else {
  357. return ($lat, $long);
  358. }
  359. }
  360. sub validate_coordinates {
  361. my $coordinates = shift;
  362. if ($coordinates =~ /^\s*$/) {
  363. return undef;
  364. }
  365. my $separators = '\s?,?\s?';
  366. my $number_part = q{[0-9]+(?:[\.,][0-9]+)?};
  367. $coordinates =~ tr/ -/ .0-9/; # replace fullwidth characters with normal ASCII
  368. $coordinates =~ s/(|)\s*(${number_part})\s*(${number_part})\s*(${number_part})${separators}(|西)\s*(${number_part})\s*(${number_part})\s*(${number_part})/$2° $3' $4" $1, $6° $7' $8" $5/;
  369. $coordinates =~ tr/北南東西/NSEW/; # replace CJK direction characters
  370. my $degree_markers = q{°d};
  371. my $minute_markers = q{'};
  372. my $second_markers = q{"″};
  373. my $decimalPart = '([+\-]?'.$number_part.')\s?['. $degree_markers .']?\s?([NSEW]?)';
  374. if ($coordinates =~ /^${decimalPart}${separators}${decimalPart}$/i) {
  375. my ($lat, $long) = swap($2, $4, degree($1, $2), degree($3, $4));
  376. return {
  377. latitude => $lat,
  378. longitude => $long
  379. };
  380. }
  381. my $dmsPart = '(?:([+\-]?'.$number_part.')[:'.$degree_markers.']\s?' .
  382. '('.$number_part.')[:'.$minute_markers.']\s?' .
  383. '(?:('.$number_part.')['.$second_markers.']?)?\s?([NSEW]?))';
  384. if ($coordinates =~ /^${dmsPart}${separators}${dmsPart}$/i) {
  385. my ($lat, $long) = swap($4, $8, dms($1, $2, $3 // 0, $4), dms($5, $6, $7 // 0, $8));
  386. return {
  387. latitude => $lat,
  388. longitude => $long
  389. };
  390. }
  391. return undef;
  392. }
  393. 1;
  394. # eof Validation.pm