PageRenderTime 82ms CodeModel.GetById 16ms RepoModel.GetById 0ms app.codeStats 0ms

/lib/services/Format/Services_Format_Parsing.php

http://github.com/spotweb/spotweb
PHP | 556 lines | 290 code | 69 blank | 197 comment | 69 complexity | 96a0d23e9f89a96e2af69faf6b78132b MD5 | raw file
Possible License(s): BSD-3-Clause, GPL-2.0, Apache-2.0, LGPL-3.0
  1. <?php
  /**
   * Parses Spotnet messages into plain associative arrays.
   *
   * Two entry points are offered: parseFull() reads the XML payload of a spot,
   * parseHeader() reads the Subject/From headers and checks the header signature.
   */
  class Services_Format_Parsing
  {
      /** @var mixed Signing service as returned by Services_Signing_Base::factory() */
      private $_spotSigning = null;

      /** @var Services_Format_Util */
      private $_util;

      public function __construct()
      {
          $this->_spotSigning = Services_Signing_Base::factory();
          $this->_util = new Services_Format_Util();
      } // ctor

      /**
       * Wraps the content of the given elements in a CDATA section when it has none.
       *
       * Some Spotnet clients create invalid XML - see messageid
       * ZOB4WPyqQfcHqykUAES8q@spot.net for example, because it uses an
       * unescaped & outside of a CDATA block.
       *
       * @param string   $xmlStr Raw XML of the spot
       * @param string[] $elems  Element names (without angle brackets) to fix up
       *
       * @return string XML with the bytes 0x00-0x1F removed and CDATA sections added
       */
      private function correctElmContents($xmlStr, $elems)
      {
          $cdataStart = '<![CDATA[';
          $cdataEnd = ']]>';

          // Replace low-ascii characters, see messageid KNCuzvnxJJErJibUAAxQJ@spot.net
          $xmlStr = preg_replace('/[\x00-\x1F]/', '', $xmlStr);

          // And loop through all elements and fix them up
          foreach ($elems as $elementName) {
              $openTag = '<'.$elementName.'>';
              $closeTag = '</'.$elementName.'>';

              // Both the opening and the closing tag have to be present
              $startElem = stripos($xmlStr, $openTag);
              $endElem = stripos($xmlStr, $closeTag);
              if (($startElem === false) || ($endElem === false)) {
                  continue;
              } // if

              // Only add the CDATA wrapper when the content does not already start with one
              $contentStart = $startElem + strlen($openTag);
              if (substr($xmlStr, $contentStart, strlen($cdataStart)) !== $cdataStart) {
                  $xmlStr = str_replace(
                      [$openTag, $closeTag],
                      [$openTag.$cdataStart, $cdataEnd.$closeTag],
                      $xmlStr
                  );
              } // if
          } // foreach

          return $xmlStr;
      } // correctElmContents

      /**
       * Make string utf8mb3 for mysql (only 3 byte utf codes).
       *
       * @param string $string      UTF-8 encoded text
       * @param string $replacement Text substituted for every 4-byte sequence
       *
       * @return string|null Result of preg_replace()
       */
      private function replace4Byte($string, $replacement = '')
      {
          return preg_replace('%(?:
              \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3
              | [\xF1-\xF3][\x80-\xBF]{3} # planes 4-15
              | \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16
              )%xs', $replacement, $string);
      } // replace4Byte

      /**
       * Converts an ISO-8859-1 string to UTF-8.
       *
       * utf8_encode() is deprecated as of PHP 8.2, so mb_convert_encoding() is
       * preferred when the mbstring extension is loaded. Both produce the same
       * bytes for ISO-8859-1 input.
       *
       * @param mixed $value Value to convert, cast to string first
       *
       * @return string
       */
      private function latin1ToUtf8($value)
      {
          $value = (string) $value;

          if (function_exists('mb_convert_encoding')) {
              return mb_convert_encoding($value, 'UTF-8', 'ISO-8859-1');
          } // if

          return utf8_encode($value);
      } // latin1ToUtf8

      /**
       * Parse a full Spot according to the XML structure.
       *
       * @param string $xmlStr XML document with a Posting element below the root
       *
       * @return array Spot fields; every key of the template below is always present
       *
       * @throws Exception When SimpleXMLElement cannot parse the (corrected) XML
       */
      public function parseFull($xmlStr)
      {
          // Create a template array so we always have the full fields to prevent ugly notices
          $tpl_spot = [
              'category' => '', 'website' => '', 'image' => '', 'sabnzbdurl' => '', 'messageid' => '',
              'searchurl' => '', 'description' => '', 'sub' => '', 'filesize' => '', 'poster' => '',
              'tag' => '', 'nzb' => [], 'title' => '', 'filename' => '', 'newsgroup' => '',
              'subcata' => '', 'subcatb' => '', 'subcatc' => '', 'subcatd' => '', 'subcatz' => '',
              'created' => '', 'key' => '', 'prevMsgids' => [], 'newsreader' => '',
          ];

          /*
           * Some legacy spotNet clients create incorrect/invalid multiple segments,
           * we use this crude way to workaround this. GH issue #1608
           */
          if (strpos($xmlStr, 'spot.net></Segment') !== false) {
              $xmlStr = str_replace(
                  ['spot.net></Segment>', 'spot.ne</Segment>'],
                  ['spot.net</Segment>', 'spot.net</Segment>'],
                  $xmlStr
              );
          } // if

          // Fix up some forgotten entity encoding / cdata sections in the XML
          $xmlStr = $this->correctElmContents($xmlStr, ['Title', 'Description', 'Image', 'Tag', 'Website']);

          /*
           * Suppress warnings for corrupt messageids, eg: <evoCgYpLlLkWe97TQAmnV@spot.net>.
           * The @ only silences warnings; an unparsable document still throws.
           */
          $xmltop = @(new SimpleXMLElement($xmlStr));
          $xml = $xmltop->Posting;

          $tpl_spot['created'] = (string) $xml->Created;
          $tpl_spot['key'] = (string) $xml->Key;
          $tpl_spot['category'] = (string) $xml->Category;
          $tpl_spot['website'] = (string) $xml->Website;
          $tpl_spot['description'] = (string) $xml->Description;
          $tpl_spot['filesize'] = (string) $xml->Size;
          $tpl_spot['poster'] = $this->latin1ToUtf8($xml->Poster);
          $tpl_spot['tag'] = $this->latin1ToUtf8($xml->Tag);
          $tpl_spot['title'] = (string) $xml->Title;

          // Decode HTML special characters, title otherwise search will be broken, description as body in newsgroup
          $tpl_spot['title'] = html_entity_decode($tpl_spot['title'], ENT_QUOTES, 'UTF-8');
          $tpl_spot['title'] = $this->replace4Byte($tpl_spot['title'], '??');
          $tpl_spot['description'] = html_entity_decode($tpl_spot['description'], ENT_QUOTES, 'UTF-8');
          $tpl_spot['description'] = $this->replace4Byte($tpl_spot['description'], '??');

          // FTD spots have the filename
          if (!empty($xml->Filename)) {
              $tpl_spot['filename'] = (string) $xml->Filename;
          } // if

          // FTD spots have the newsgroup. SimpleXML element names are case-sensitive,
          // so the element read has to be spelled exactly like the one tested.
          if (!empty($xml->Newsgroup)) {
              $tpl_spot['newsgroup'] = (string) $xml->Newsgroup;
          } // if

          /*
           * Images available can be in the XML in two different ways.
           *
           * Some older spots just have an URL we can use, newer spots
           * have an height/width/messageid(s) pair we use to retrieve the image
           * from
           */
          if (empty($xml->Image->Segment)) {
              $tpl_spot['image'] = (string) $xml->Image;
          } else {
              $tpl_spot['image'] = [
                  'height' => (string) $xml->Image['Height'],
                  'width'  => (string) $xml->Image['Width'],
              ];

              foreach ($xml->xpath('/Spotnet/Posting/Image/Segment') as $seg) {
                  $segmentId = (string) $seg;

                  // Make sure the messageid's are valid so we do not throw an NNTP error
                  if (!$this->_util->validMessageId($segmentId)) {
                      // One invalid segment discards the complete list
                      $tpl_spot['image']['segment'] = [];
                      break;
                  } // if

                  $tpl_spot['image']['segment'][] = $segmentId;
              } // foreach
          } // else

          // Just stitch together the NZB segments
          foreach ($xml->xpath('/Spotnet/Posting/NZB/Segment') as $seg) {
              $segmentId = (string) $seg;

              if (!$this->_util->validMessageId($segmentId)) {
                  // One invalid segment discards the complete list
                  $tpl_spot['nzb'] = [];
                  break;
              } // if

              $tpl_spot['nzb'][] = $segmentId;
          } // foreach

          // PREVSPOTS
          if (!empty($xml->PREVSPOTS->Spot)) {
              foreach ($xml->xpath('/Spotnet/Posting/PREVSPOTS/Spot') as $seg) {
                  // Make sure the messageid's are valid so we do not throw an NNTP error
                  if ($this->_util->validMessageId((string) $seg)) {
                      $tpl_spot['prevMsgids'][] = (string) $seg;
                  } // if
              } // foreach
          } // if

          // Extra / newsreader
          if (!empty($xmltop->Extra->Newsreader)) {
              $tpl_spot['newsreader'] = (string) $xmltop->Extra->Newsreader;
          } // if

          // fix the category in the XML array but only for new spots
          if ((int) $xml->Key != 1) {
              $tpl_spot['category'] = ((int) $tpl_spot['category']) - 1;
          } // if

          /*
           * For FTD spots an array of subcategories is created. This array is not
           * compatible with that of newer spots so we need two seperate codepaths.
           *
           * We fix up the category list later in the system, so we just extract the
           * list of subcategories.
           *
           * NOTE(review): the test below looks at a SubCat child of Posting while the
           * xpath reads SubCat below Category - confirm which layout FTD spots use.
           */
          $subcatList = [];
          if (!empty($xml->SubCat)) {
              foreach ($xml->xpath('/Spotnet/Posting/Category/SubCat') as $sub) {
                  $subcatList[] = (string) $sub;
              } // foreach
          } else {
              foreach ($xml->xpath('/Spotnet/Posting/Category/Sub') as $sub) {
                  $subcatList[] = (string) $sub;
              } // foreach
          } // else

          /*
           * Mangle the several types of subcategory listing to make sure we only
           * have to use one type in the rest of Spotweb
           */
          foreach ($subcatList as $subcat) {
              if (preg_match('/(\d+)([aAbBcCdDzZ])(\d+)/', $subcat, $tmpMatches)) {
                  $subCatVal = strtolower($tmpMatches[2]).((int) $tmpMatches[3]);
                  $tpl_spot['subcat'.$subCatVal[0]] .= $subCatVal.'|';
              } // if
          } // foreach

          /*
           * subcatz is a subcategory introduced in later Spotnet formats, we prefer to
           * always have this subcategory so we just fake it if it's not listed.
           */
          if (empty($tpl_spot['subcatz'])) {
              $tpl_spot['subcatz'] = SpotCategories::createSubcatZ(
                  $tpl_spot['category'],
                  $tpl_spot['subcata'].$tpl_spot['subcatb'].$tpl_spot['subcatd']
              );
          } // if

          // map deprecated genre categories to their new genre category
          $tpl_spot['subcatd'] = SpotCategories::mapDeprecatedGenreSubCategories(
              $tpl_spot['category'],
              $tpl_spot['subcatd'],
              $tpl_spot['subcatz']
          );
          $tpl_spot['subcatc'] = SpotCategories::mapLanguageSubCategories(
              $tpl_spot['category'],
              $tpl_spot['subcatc'],
              $tpl_spot['subcatz']
          );

          // and return the parsed XML
          return $tpl_spot;
      } // parseFull()

      /**
       * Splits the subcategory part of a From header into one list per letter.
       *
       * FTD spots just list all subcategories like: a9b4c0d5d15d11
       * Newer spots use a fixed width for each subcategory like: a09b04c00d05d15d11.
       * Both forms are handled by the same loop.
       *
       * @param string $catList Everything after the category and key-id characters
       *
       * @return array Map with the keys a, b, c, d and z, each a '|' terminated list
       */
      private function parseHeaderSubcategories($catList)
      {
          $subcats = ['a' => '', 'b' => '', 'c' => '', 'd' => '', 'z' => ''];

          // Pad with three non-numeric tokens so the last real subcategory is flushed too
          $catList = strtolower($catList).'!!!';
          $catListLen = strlen($catList);
          $token = '';

          for ($i = 0; $i < $catListLen; $i++) {
              $char = $catList[$i];

              /*
               * A non-numeric character starts the next subcategory, so store the
               * one collected so far. The token is compared to '' on purpose:
               * empty() would treat the token '0' as empty as well.
               */
              if ((!is_numeric($char)) && ($token !== '')) {
                  if (isset($subcats[$token[0]])) {
                      $subcats[$token[0]] .= $token[0].(int) substr($token, 1).'|';
                  } // if

                  $token = '';
              } // if

              $token .= $char;
          } // for

          return $subcats;
      } // parseHeaderSubcategories

      /**
       * Verifies a header that carries the posters' own public key.
       *
       * The sha1 of the bracketed messageid has to start with '0000' (hashcash).
       * Only then is the signature checked against a fake RSA key array built
       * from the public key found in the header.
       *
       * @param array  $spot      Spot parsed so far
       * @param string $signature Unprepared header signature
       * @param int    $keyId     Index under which the users' key is offered
       *
       * @return array The spot with 'verified' set, and 'spotterid' when the signature matched
       */
      private function verifySelfSignedHeader(array $spot, $signature, $keyId)
      {
          $userSignedHash = sha1('<'.$spot['messageid'].'>', false);
          $spot['verified'] = (substr($userSignedHash, 0, 4) === '0000');

          if ($spot['verified']) {
              /*
               * Create a fake RSA keyarray so we can validate it using our standard
               * infrastructure
               */
              $userRsaKey = [$keyId => ['modulo' => $spot['selfsignedpubkey'], 'exponent' => 'AQAB']];

              /*
               * We cannot use this as a full measure to check the spot's validness yet,
               * because at least one Spotnet client feeds us invalid data for now
               */
              if ($this->_spotSigning->verifySpotHeader($spot, $signature, $userRsaKey)) {
                  $spot['spotterid'] = $this->_util->calculateSpotterId($spot['selfsignedpubkey']);
              } // if
          } // if

          return $spot;
      } // verifySelfSignedHeader

      /**
       * Parse a Spot using only the header information.
       *
       * @param string $subj      Subject header
       * @param string $from      From header
       * @param string $date      Date header, handed to strtotime()
       * @param string $messageid Messageid without the surrounding <>
       * @param array  $rsaKeys   Known server keys, indexed by key id
       *
       * @return array|false The parsed spot, or false when the From header cannot be
       *                     parsed. A returned spot still has 'verified' set to false
       *                     when its signature could not be validated.
       */
      public function parseHeader($subj, $from, $date, $messageid, $rsaKeys)
      {
          // Initialize an empty array, we create a basic template in a few
          $spot = [];

          /*
           * The "From" header is created using the following system:
           *
           * From: [Nickname] <[RANDOM or PUBLICKEY]@[CAT][KEY-ID][SUBCAT].[SIZE].[RANDOM].[DATE].[CUSTOM-ID].[CUSTOM-VALUE].[SIGNATURE]>
           * or
           * From: [Nickname] <[PUBLICKEY-MODULO.USERSIGNATURE]@[CAT][KEY-ID][SUBCAT].[SIZE].[RANDOM].[DATE].[CUSTOM-ID].[CUSTOM-VALUE].[SIGNATURE]>
           *
           * First we want to extract everything after the @ but because a nickname
           * could contain an @, we have to mangle it a bit
           */
          $fromInfoPos = strpos($from, '<');
          if ($fromInfoPos === false) {
              return false;
          } // if

          // Remove the posters' name and the <> characters
          $fromAddress = explode('@', substr($from, $fromInfoPos + 1, -1));
          if (count($fromAddress) < 2) {
              return false;
          } // if

          $spot['header'] = $fromAddress[1];

          /*
           * It is possible the part before the @ contains both the
           * users' signature as the spots signature as signed by the user
           */
          $headerSignatureTemp = explode('.', $fromAddress[0]);
          $spot['selfsignedpubkey'] = $this->_util->spotUnprepareBase64($headerSignatureTemp[0]);
          if (isset($headerSignatureTemp[1])) {
              $spot['user-signature'] = $this->_util->spotUnprepareBase64($headerSignatureTemp[1]);
          } // if

          /*
           * Initialize some basic variables. We set 'verified' to false so we can
           * exit this function at any time and the gathered data for this spot up til
           * then is stil ignored.
           */
          $spot['verified'] = false;
          $spot['filesize'] = 0;
          $spot['messageid'] = $messageid;
          $spot['stamp'] = strtotime($date);

          /*
           * Split the .-delimited fields into an array so we can mangle it. We require
           * atleast six fields, if any less we can safely assume the spot is invalid
           */
          $fields = explode('.', $spot['header']);
          if (count($fields) < 6) {
              return false;
          } // if

          /*
           * Extract the fixed fields from the header.
           *
           * The poster is everything before the separator in front of the '<'. When
           * the header starts with '<' there is no poster at all; a negative length
           * would make substr() return nearly the whole header instead.
           */
          $spot['poster'] = ($fromInfoPos > 0) ? substr($from, 0, $fromInfoPos - 1) : '';
          $spot['category'] = ((int) substr($fields[0], 0, 1)) - 1;
          $spot['keyid'] = (int) substr($fields[0], 1, 1);
          $spot['filesize'] = $fields[1];
          $spot['subcata'] = '';
          $spot['subcatb'] = '';
          $spot['subcatc'] = '';
          $spot['subcatd'] = '';
          $spot['subcatz'] = '';
          $spot['wassigned'] = false;
          $spot['spotterid'] = '';
          $isRecentKey = $spot['keyid'] != 1;

          // If the keyid is invalid, abort trying to parse it
          if ($spot['keyid'] < 0) {
              return false;
          } // if

          // And just try to extract all given subcategories
          foreach ($this->parseHeaderSubcategories(substr($fields[0], 2)) as $letter => $subcatValues) {
              $spot['subcat'.$letter] = $subcatValues;
          } // foreach

          /*
           * subcatz is a subcategory introduced in later Spotnet formats, we prefer to
           * always have this subcategory so we just fake it if it's not listed.
           */
          if (empty($spot['subcatz'])) {
              $spot['subcatz'] = SpotCategories::createSubcatZ(
                  $spot['category'],
                  $spot['subcata'].$spot['subcatb'].$spot['subcatd']
              );
          } // if

          // map deprecated genre categories to their new genre category
          $spot['subcatd'] = SpotCategories::mapDeprecatedGenreSubCategories($spot['category'], $spot['subcatd'], $spot['subcatz']);
          $spot['subcatc'] = SpotCategories::mapLanguageSubCategories($spot['category'], $spot['subcatc'], $spot['subcatz']);

          if ((strpos($subj, '=?') !== false) && (strpos($subj, '?=') !== false)) {
              // This is an old format to parse, instantiate the legacy parsing
              $legacyParser = new Services_Format_ParsingLegacy();

              // Make sure its as simple as possible
              $subj = str_replace('?= =?', '?==?', $subj);
              $subj = trim($legacyParser->oldEncodingParse($subj));

              // Double quotes are required here: '\r' in single quotes is a literal backslash and r
              $subj = str_replace(["\r", "\n"], '', $subj);
          } // if

          $tmp = explode('|', $subj);
          if ($isRecentKey) {
              $spot['title'] = trim($tmp[0]);
              $spot['tag'] = (count($tmp) > 1) ? trim($tmp[1]) : '';
          } else {
              // The tag is the last part, the title is everything but the last two parts
              $spot['tag'] = trim($tmp[count($tmp) - 1]);
              $spot['title'] = trim(implode('|', array_slice($tmp, 0, -2)));

              if ((strpos($spot['title'], chr(0xc2)) !== false) || (strpos($spot['title'], chr(0xc3)) !== false)) {
                  // This is an old format to parse, instantiate the legacy parsing
                  $legacyParser = new Services_Format_ParsingLegacy();
                  $spot['title'] = trim($legacyParser->oldEncodingParse($spot['title']));
              } // if
          } // if recentKey

          // Title and poster fields are mandatory, we require it to validate the signature
          if ((strlen($spot['title']) == 0) || (strlen($spot['poster']) == 0)) {
              return $spot;
          } // if

          /*
           * For any recentkey ( >1) or spots created after year-2010, we require the spot
           * to be signed
           */
          $mustbeSigned = $isRecentKey || ($spot['stamp'] > 1293870080);
          if ($mustbeSigned) {
              $spot['headersign'] = $fields[count($fields) - 1];
              $spot['wassigned'] = (strlen($spot['headersign']) != 0);
          } else {
              // Doesn't need to be signed, pretend that it is
              $spot['verified'] = true;
              $spot['wassigned'] = false;
          } // else

          // Don't verify spots which are already verified
          if ($spot['wassigned']) {
              /*
               * There are currently two known methods to which Spots are signed,
               * each having different charachteristics, making it a bit difficult
               * to work with this.
               *
               * The oldest method uses a secret private key and a signing server, we
               * name this method SPOTSIGN_V1. The users' public key is only available
               * in the XML header, not in the From header. This is the preferred method.
               *
               * The second method uses a so-called "self signed" spot (the spotter signs
               * the spots, posts the public key in the header and a hashcash is used to
               * prevent spamming). This method is called SPOTSIGN_V2.
               */

              // the signature this header is signed with
              $signature = $this->_util->spotUnprepareBase64($spot['headersign']);

              if ($spot['keyid'] == 7) {
                  /*
                   * SPOTSIGN_V2: KeyID 7 has a special meaning, it defines a
                   * self-signed spot and requires a hashcash
                   */
                  $spot = $this->verifySelfSignedHeader($spot, $signature, 7);
              } elseif (isset($rsaKeys[$spot['keyid']])) {
                  // SPOTSIGN_V1: only for keys which are actually known
                  $isPersonalDispose = ($spot['keyid'] == 2)
                      && ((int) $spot['filesize'] === 999)
                      && (strlen($spot['selfsignedpubkey']) > 50);

                  if ($isPersonalDispose) {
                      // Check personal dispose message
                      $spot = $this->verifySelfSignedHeader($spot, $signature, 2);
                  } else {
                      $spot['verified'] = $this->_spotSigning->verifySpotHeader($spot, $signature, $rsaKeys);
                  } // else
              } // elseif

              /*
               * Even more recent spots, contain the users' full publickey
               * in the header. This allows us to uniquely identify and verify
               * the poster of the spot.
               *
               * Try to extract this information.
               */
              if (($spot['verified']) && (!empty($spot['user-signature'])) && (!empty($spot['selfsignedpubkey']))) {
                  // Extract the public key
                  $spot['spotterid'] = $this->_util->calculateSpotterId($spot['selfsignedpubkey']);
                  $spot['user-key'] = ['modulo' => $spot['selfsignedpubkey'], 'exponent' => 'AQAB'];

                  // The spot contains the signature in the header of the spot
                  $spot['verified'] = $this->_spotSigning->verifyFullSpot($spot);
              } // if
          } // if was signed

          /*
           * We convert the title and other fields to UTF8, we cannot
           * do this any earlier because it would break the RSA signature
           */
          if ($spot['verified']) {
              $spot['title'] = $this->latin1ToUtf8($spot['title']);
              $spot['poster'] = $this->latin1ToUtf8($spot['poster']);
              $spot['tag'] = $this->latin1ToUtf8($spot['tag']);

              // If a spot is in the future, fix it
              if (time() < $spot['stamp']) {
                  $spot['stamp'] = time();
              } // if
          } // if

          return $spot;
      } // parseHeader
  } // class Services_Format_Parsing