PageRenderTime 56ms CodeModel.GetById 22ms RepoModel.GetById 0ms app.codeStats 0ms

/src/php/document/get-metadata.php

https://bitbucket.org/silverasm/wordseer
PHP | 1031 lines | 958 code | 33 blank | 40 comment | 101 complexity | 721b5ef5db6926fbc847e6516dcc6cd8 MD5 | raw file
Possible License(s): Apache-2.0, LGPL-3.0, BSD-3-Clause
  1. <?php
  2. /* Copyright 2012 Aditi Muralidharan. See the file "LICENSE" for the full license governing this code. */
  3. /** get-metadata.php
  4. **/
  5. include_once '../util.php';
  6. $wordseer_instance = getGetParam('instance');
  7. $path = '../../../instances/'.$wordseer_instance.'/config.php';
  8. include_once $path;
  9. /** Create a temporary table to hold the results of sentence ID's that match
  10. various filters.
  11. */
  12. global $num_filter_conditions;
  13. global $num_search_conditions;
  14. global $created;
  15. global $main_metadata_counts_table;
  16. global $main_metadata_counts_table_index;
  17. $main_metadata_counts_table_index = 0;
  18. $main_metadata_counts_table = "main_metadata_counts_".$main_metadata_counts_table_index;
  19. $sql = "SET session tmp_table_size = 1073741824;";
  20. mysql_query($sql);
  21. $sql = "SET session max_heap_table_size = 1073741824;";
  22. mysql_query($sql);
  23. resetSentenceFilters();
  24. if (strstr($_SERVER['REQUEST_URI'], 'get-metadata.php')) {
  25. metadata_dispatch(); // All the way at the bottom of this file.
  26. }
  27. function resetSentenceFilters() {
  28. global $num_filter_conditions;
  29. global $num_search_conditions;
  30. global $created;
  31. if (!$created) {
  32. $sql = "DROP TEMPORARY TABLE IF EXISTS `filtered_sent_ids`;";
  33. mysql_query($sql) or die (mysql_error(). " On <br> $sql");
  34. $sql = "CREATE TEMPORARY TABLE `filtered_sent_ids` (
  35. `id` int(11) NOT NULL DEFAULT '0',
  36. `document_id` int(11) NOT NULL DEFAULT '0',
  37. `num_matched` int(11) NOT NULL DEFAULT '1',
  38. `num_searches_matched` int(11) NOT NULL DEFAULT '0',
  39. PRIMARY KEY (`id`),
  40. KEY `num_matched` (`num_matched`, `id`),
  41. KEY `num_searches_matched` (`num_searches_matched`, `id`)
  42. ) ENGINE=MEMORY DEFAULT CHARSET=utf8";
  43. mysql_query($sql) or die (mysql_error()." on:
  44. <br> $sql get-metadata.php l 507");
  45. $num_filter_conditions = 0;
  46. $num_search_conditions = 0;
  47. }
  48. $created = true;
  49. }
  50. function resetUnitsFilter() {
  51. $sql = "DROP TEMPORARY TABLE IF EXISTS `filtered_unit_ids`;";
  52. mysql_query($sql) or die (mysql_error(). " On <br> $sql");
  53. $sql = "CREATE TEMPORARY TABLE `filtered_unit_ids` (
  54. `unit_id` int(11) NOT NULL DEFAULT '0',
  55. `sentence_id` int(11) NOT NULL DEFAULT '0',
  56. `document_id` int(11) NOT NULL DEFAULT '0',
  57. PRIMARY KEY (`unit_id`, `sentence_id`)
  58. ) ENGINE=MyISAM DEFAULT CHARSET=utf8";
  59. mysql_query($sql) or die (mysql_error()." on:
  60. <br> $sql <br> at get-metadata.php l 55");
  61. }
  62. /** Returns the document ID's that satisfy all the given metadata filters */
  63. function getDocumentIDsForMetadata($metadata){
  64. $timing = getGetParam('timing');
  65. $unitSets = getUnitsForMetadata($metadata);
  66. if($unitSets != "all"){
  67. $t1 = time();
  68. $units_intersection_sql = "";
  69. $unit_set_ids_strings = array();
  70. if (count($unitSets) > 0) {
  71. foreach ($unitSets as $unitSet) {
  72. $id_string = "(unit_id IN (".join(", ", $unitSet)."))";
  73. array_push($unit_set_ids_strings, $id_string);
  74. }
  75. $units_intersection_sql = "AND (".join( "
  76. AND ",
  77. $unit_set_ids_strings).") ";
  78. } else {
  79. $units_intersection_sql = " AND FALSE ";
  80. }
  81. $sql = "SELECT document_id from document_structure
  82. WHERE TRUE $units_intersection_sql;";
  83. $result = mysql_query($sql) or die("Error getting sentence IDs for
  84. metadata <br>
  85. ".mysql_error()."
  86. <br> on query
  87. <br>".$sql);
  88. $document_ids = array();
  89. while ($row = mysql_fetch_assoc($result)) {
  90. array_push($document_ids, $row["document_id"]);
  91. }
  92. $t2 = time();
  93. if ($timing) {
  94. echo "<br> SQL to get matching document ID's:<br> $sql <br>";
  95. echo "Time to get documentIDs for metadata: ".($t2 - $t1)."s<br>";
  96. }
  97. return $document_ids;
  98. }else{
  99. return "all";
  100. }
  101. }
  102. function getDocumentIDsForMetadataAndCollection($metadata, $collection) {
  103. global $timing;
  104. $timing = getGetParam('timing');
  105. $t1 = time();
  106. $document_ids = array();
  107. // Apply the metadata filter and get the resulting document IDs.
  108. $filtered_by_metadata = getDocumentIDsForMetadata($metadata);
  109. $metadata_filter_active = ($filtered_by_metadata != 'all');
  110. if ($timing) {
  111. echo "
  112. <br>Filtered document IDs: ".json_encode($filtered_by_metadata)."
  113. <br>";
  114. }
  115. // Apply the collections filter and get the resulting document IDs.
  116. $collection_filter_active =($collection != false && $collection != 'all');
  117. $filtered_by_collection = array();
  118. if($collection_filter_active){
  119. // subsets/read.php contains collection maniputlations.
  120. $filtered_by_collection = getDocumentIDsInCollection($collection);
  121. }
  122. if ($metadata_filter_active && $collection_filter_active) {
  123. $document_ids = array_intersect($filtered_by_metadata,
  124. $filtered_by_collection);
  125. } else if ($metadata_filter_active) {
  126. $document_ids = $filtered_by_metadata;
  127. } else if ($collection_filter_active) {
  128. $document_ids = $filtered_by_collection;
  129. } else {
  130. $document_ids = "all";
  131. }
  132. $t2 = time();
  133. if ($timing) {
  134. echo "All matching document IDs:<br>
  135. ".json_encode($document_ids)."
  136. <br>";
  137. echo "Time to get all matching document IDs: ".($t2 - $t1)."s<br><br>";
  138. }
  139. return $document_ids;
  140. }
  141. /** Returns "all" if there are no active filters, but a list of sentence IDs
  142. otherwise **/
  143. function getSentenceIDsForMetadataAndCollection($metadata, $collection) {
  144. $timing = getGetParam('timing');
  145. $t1 = time();
  146. $sentence_ids = array();
  147. // Apply the metadata filter and get the resulting sentence IDs.
  148. $filtered_by_metadata = getSentenceIDsForMetadata($metadata);
  149. $metadata_filter_active = ($filtered_by_metadata != 'all');
  150. // Apply the collections filter and get the resulting sentence IDs.
  151. $collection_filter_active =($collection != false && $collection != 'all');
  152. $filtered_by_collection = array();
  153. if($collection_filter_active){
  154. // subsets/read.php contains collection maniputlations.
  155. $filtered_by_collection = getSentenceIDsInCollection($collection);
  156. }
  157. if ($metadata_filter_active && $collection_filter_active) {
  158. $sentence_ids = array_intersect($filtered_by_metadata,
  159. $filtered_by_collection);
  160. } else if ($metadata_filter_active) {
  161. $sentence_ids = $filtered_by_metadata;
  162. } else if ($collection_filter_active) {
  163. $sentence_ids = $filtered_by_collection;
  164. } else {
  165. $sentence_ids = "all";
  166. }
  167. $t2 = time();
  168. if ($timing) {
  169. echo "<br>Total time to get sentence ids matching collection and metadata
  170. filters: ".($t2 - $t1)."s<br>";
  171. }
  172. return $sentence_ids;
  173. }
  174. function getSentenceIDsForMetadata($metadata) {
  175. $timing = getGetParam('timing');
  176. global $num_filter_conditions;
  177. $table_identifier = 'filtered_sent_ids';
  178. $insertion_fields = '(id, document_id)';
  179. $field_identifier = "DISTINCT sentence_id, document_id";
  180. $query_id_where = '';
  181. global $cache_results;
  182. global $query_id;
  183. if ($cache_results || $query_id) {
  184. $table_identifier = 'cached_filtered_sent_ids';
  185. $insertion_fields = '(id, document_id, query_id)';
  186. $field_identifier = "DISTINCT sentence_id, document_id, $query_id";
  187. $query_id_where = " AND query_id = $query_id ";
  188. }
  189. $alt_field_identifier = str_replace("sentence_id", "id",
  190. $field_identifier);
  191. $t1 = time();
  192. $unitSets = getUnitsForMetadata($metadata);
  193. $t2 = time();
  194. if ($timing) {
  195. echo "Time to get unit ids for metadata: ".($t2 - $t1)."s<br><br>";
  196. }
  197. if($unitSets != "all"){
  198. if (!$query_id || $cache_results) {
  199. if (count($unitSets) > 0) {
  200. foreach ($unitSets as $unitSet) {
  201. $units_where_clause = makeUnitsWhereClause($unitSet);
  202. $sql = "INSERT INTO $table_identifier $insertion_fields
  203. SELECT $field_identifier
  204. FROM sentence_xref_unit, sentence
  205. WHERE $units_where_clause
  206. AND sentence_xref_unit.sentence_id = sentence.id
  207. ON DUPLICATE KEY update num_matched = num_matched + 1;";
  208. $results = mysql_query($sql) or die (mysql_error()."<br> on
  209. $sql <br> get-metadata.php l. 210");
  210. // in case the units are sentences;
  211. $id_where_clause = makeIdWhereClause($unitSet);
  212. $sql = "INSERT INTO $table_identifier $insertion_fields
  213. SELECT $alt_field_identifier
  214. FROM sentence
  215. WHERE $id_where_clause
  216. ON DUPLICATE KEY update num_matched = num_matched + 1;";
  217. $results = mysql_query($sql) or die (mysql_error()."<br> on
  218. $sql <br> get-metadata.php l. 210");
  219. }
  220. }
  221. $num_filter_conditions += count($unitSets);
  222. updateSentenceFilterTable();
  223. }
  224. if (!$cache_results) {
  225. $sql = "SELECT id from $table_identifier WHERE TRUE
  226. $query_id_where;";
  227. $result = mysql_query($sql) or die("Error getting sentence IDs for
  228. metadata <br>
  229. ".mysql_error()."
  230. <br> on query
  231. <br>".$sql);
  232. $sentenceIDs = array();
  233. while ($row = mysql_fetch_assoc($result)) {
  234. array_push($sentenceIDs, $row["id"]);
  235. }
  236. $t3 = time();
  237. if ($timing) {
  238. echo "Time to get ".count($sentenceIDs)." sentence ids for
  239. metadata: ".($t3 - $t2)."s<br><br>";
  240. }
  241. return $sentenceIDs;
  242. } else {
  243. return array();
  244. }
  245. } else{
  246. return "all";
  247. }
  248. }
  249. function updateSentenceFilterTable() {
  250. global $num_filter_conditions;
  251. global $num_search_conditions;
  252. global $cache_results;
  253. global $query_id;
  254. $table_identifier = 'filtered_sent_ids';
  255. $query_id_where = '';
  256. if ($cache_results) {
  257. $table_identifier = 'cached_filtered_sent_ids';
  258. $query_id_where = " AND query_id = $query_id ";
  259. }
  260. $sql = "DELETE from $table_identifier
  261. WHERE (num_matched < $num_filter_conditions
  262. OR num_searches_matched < $num_search_conditions)
  263. $query_id_where;";
  264. mysql_query($sql) or die(mysql_error()." on <br> $sql
  265. <br> while clearing temporary filtered sentence id table,
  266. <br>get-metadata.php .l 177");
  267. }
  268. function updateTemporarySentenceFilterTable() {
  269. global $num_filter_conditions;
  270. global $num_search_conditions;
  271. $sql = "DELETE from filtered_sent_ids
  272. WHERE (num_matched < $num_filter_conditions
  273. OR num_searches_matched < $num_search_conditions);";
  274. mysql_query($sql) or die(mysql_error()." on <br> $sql
  275. <br> while clearing temporary filtered sentence id table,
  276. <br>get-metadata.php .l 680");
  277. }
  278. function displayFilterStatistics() {
  279. global $num_filter_conditions;
  280. global $num_search_conditions;
  281. global $cache_results;
  282. global $query_id;
  283. global $dont_cache_search_results;
  284. $table_identifier = 'filtered_sent_ids';
  285. $query_id_where = '';
  286. if (($cache_results || $query_id) && !$dont_cache_search_results) {
  287. echo "<br>Using cache: true<br>";
  288. $table_identifier = 'cached_filtered_sent_ids';
  289. $query_id_where = " AND query_id = $query_id ";
  290. }
  291. $sql = "SELECT * from $table_identifier WHERE TRUE $query_id_where;";
  292. $result = mysql_query($sql);
  293. echo "<br> num_filter_conditions = $num_filter_conditions
  294. <br> num_search_conditions = $num_search_conditions
  295. <br>Number of filtered sentences: ".mysql_num_rows($result);
  296. }
  297. function metadataIsEmpty($metadata){
  298. $all = false;
  299. if(!$metadata){
  300. $all = true;
  301. } else if(json_encode($metadata) == "{}") {
  302. if(count(array_keys($metadata)) == 0){
  303. $all = true;
  304. }
  305. }
  306. return $all;
  307. }
  308. /** Returns the unit ID's that satisfy all the given metadata filters
  309. Consults the document_id GET parameter to see whether the search should be
  310. restricted to the given document ID.
  311. */
  312. function getUnitsForMetadata($metadata){
  313. $timing = getGetParam('timing');
  314. global $query_id;
  315. global $cache_results;
  316. $document_id = getGetParam('document_id');
  317. $document_where_clause = "";
  318. if ($document_id) {
  319. $document_where_clause = " AND document_id = $document_id ";
  320. }
  321. $t1 = time();
  322. $all = metadataIsEmpty($metadata);
  323. $unitSets = "all";
  324. if (!$query_id || $cache_results) {
  325. if(!$all) {
  326. $unitSets = array();
  327. $andConditions = array();
  328. foreach(array_keys($metadata) as $identifier){
  329. $components = explode("_", $identifier);
  330. $type = $components[0];
  331. $length = count($components);
  332. $name = array_slice($components, 1);
  333. $property = mysql_escape_string(join("_", $name));
  334. if($type == "string") {
  335. $values = $metadata[$identifier];
  336. foreach($values as $val){
  337. $components = explode("__", mysql_escape_string($val));
  338. $value = $components[0];
  339. if (strstr($property, "_set")) {
  340. $value = $components[1];
  341. }
  342. $q = " (property_name = '$property' AND value = '$value')";
  343. array_push($andConditions, $q);
  344. }
  345. } else {
  346. $ranges = $metadata[$identifier];
  347. if ($timing != 0) {
  348. echo $identifier."<br>";
  349. echo json_encode($ranges)."<br>";
  350. }
  351. $queries = array();
  352. $property = mysql_escape_string($property);
  353. forEach($ranges as $range){
  354. $start = $range[0];
  355. $end = $range[1];
  356. $q = " (property_name = '$property' AND
  357. value*1 <= $end
  358. AND value*1 >= $start ) ";
  359. if (strstr($start, ' ') || strstr($end, ' ')) {
  360. $q = " (property_name = '$property' AND
  361. value <= '$end'
  362. AND value >= '$start' ) ";
  363. }
  364. array_push($queries, $q);
  365. }
  366. $q = join(" OR ", $queries);
  367. array_push($andConditions, "($q)");
  368. }
  369. }
  370. $sql = "SELECT unit_id from metadata
  371. WHERE
  372. ";
  373. foreach($andConditions as $condition){
  374. $unitSet = array();
  375. $q = $sql.$condition.$document_where_clause;
  376. $result = mysql_query($q.";") or die(mysql_error()."
  377. <br> on query
  378. <br> $q
  379. <br> at get-metadata.php line 237.");
  380. if ($timing) {
  381. echo "<br>
  382. $q
  383. <br>------------<br>";
  384. }
  385. while($row = mysql_fetch_assoc($result)){
  386. array_push($unitSet, $row["unit_id"]);
  387. }
  388. array_push($unitSets, $unitSet);
  389. }
  390. } else if ($document_id) {
  391. $unitSets = array();
  392. $sql = "SELECT unit_id from document_structure
  393. WHERE document_id = $document_id;";
  394. $result = mysql_query($sql.";") or die(mysql_error()."
  395. <br> on query:
  396. <br> $sql
  397. <br> While getting metadata for document id:'$document_id'
  398. <br> at get-metadata.php line 259.");
  399. if ($timing) {
  400. echo "<br>
  401. $sql
  402. <br>------------<br>";
  403. }
  404. $unitSet = array();
  405. while($row = mysql_fetch_assoc($result)){
  406. array_push($unitSet, $row["unit_id"]);
  407. }
  408. array_push($unitSets, $unitSet);
  409. } else {
  410. return "all";
  411. }
  412. $t2 = time();
  413. if ($timing) {
  414. echo "Time to get units for metadata: ".($t2 - $t1)."s<br><br>";
  415. }
  416. } else {
  417. $all = $all && !$document_id;
  418. if ($all) {
  419. return "all";
  420. } else {
  421. return array();
  422. }
  423. }
  424. return $unitSets;
  425. }
  426. /** Climbs the document structure tree up and down to get all the sub-units
  427. and parent units associated with this unit**/
  428. function getAllAssociatedUnits($units){
  429. $sentence_ids = getSentenceIDsForUnits($units);
  430. $associated_units = getUnitsFromSentenceIDs($sentence_ids);
  431. return $associated_units;
  432. }
  433. /** Get a list of sentence ids from a list of unit IDs**/
  434. function getSentenceIDsForUnits($units){
  435. global $timing;
  436. $sentenceIDs = array();
  437. $t1 = time();
  438. $where_clause = makeUnitsWhereClause($units);
  439. $sql = "SELECT DISTINCT sentence_id from sentence_xref_unit
  440. WHERE $where_clause;";
  441. $result = mysql_query($sql) or die ("Error getting sentences from unitSet
  442. <br>".mysql_error()."
  443. <br>$sql<br>");
  444. while ($row = mysql_fetch_assoc($result)) {
  445. array_push($sentenceIDs, $row['sentence_id']);
  446. }
  447. $t2 = time();
  448. if ($timing != 0) {
  449. echo "<br> Time to get sentence ID's for unit set: ".($t2 - $t1)."<br>";
  450. }
  451. return $sentenceIDs;
  452. }
  453. function makeUnitsWhereClause($units) {
  454. array_push($units, -1);
  455. return " unit_id IN (".join(", ", $units).") ";
  456. }
  457. function makeIdWhereClause($units) {
  458. array_push($units, -1);
  459. return " id IN (".join(", ", $units).") ";
  460. }
  461. /** Get a list of unit_id's from a list of sentence IDs.
  462. This function traces the document tree up from individual sentences to the
  463. document and returns all the unit-name unit-id pairs found along the way.
  464. **/
  465. function getUnitsFromSentenceIDs($sentence_ids){
  466. $timing = getGetParam('timing');
  467. $t1 = time();
  468. $sentence_id_string = join(", ", $sentence_ids);
  469. $sql = "SELECT distinct unit_id from sentence_xref_unit
  470. WHERE sentence_id in ($sentence_id_string);";
  471. $result = mysql_query($sql) or die ("Error getting associated units
  472. <br>".mysql_error()."<br>$sql<br>");
  473. $units = array();
  474. while ($row = mysql_fetch_assoc($result)) {
  475. array_push($units, $row["unit_id"]);
  476. }
  477. $t2 = time();
  478. if ($timing) {
  479. echo "<br>Time to get units from sentenceIDs: ".($t2 - $t1)."s
  480. <br>";
  481. }
  482. return $units;
  483. }
  484. /** Given a sentence ID, returns an associative array
  485. containing the document title **/
  486. function getMetadata($sentenceID){
  487. $query="SELECT document_id, title, sentence
  488. FROM document, sentence
  489. WHERE document.id = sentence.document_id
  490. AND sentence.id = $sentenceID;";
  491. $result = mysql_query($query);
  492. $row = mysql_fetch_assoc($result);
  493. return $row;
  494. }
  495. function getMetadataTreeFromSentenceIDs($sentenceIDs, $all){
  496. global $main_metadata_counts_table;
  497. $timing = getGetParam('timing');
  498. $t1 = time();
  499. $metadata = array();
  500. $sql = "";
  501. if ($all) {
  502. if (!table_exists($main_metadata_counts_table)) {
  503. // clear older tables from previous code versions.
  504. for ($i = 0; $i < $main_metadata_counts_table_index; $i++) {
  505. $sql = "DROP TABLE IF EXISTS `main_metadata_counts_$i`";
  506. $result = mysql_query($sql) or die(mysql_error()."<br>
  507. on query<br>
  508. $sql
  509. <br> made on get-metadata.php line 512");
  510. }
  511. $sql = "CREATE TABLE IF NOT EXISTS `$main_metadata_counts_table` (
  512. `property_name` varchar(100) NOT NULL DEFAULT '',
  513. `value` varchar(200) NOT NULL DEFAULT '',
  514. `count` int NOT NULL DEFAULT '0',
  515. `document_count` int NOT NULL DEFAULT '0',
  516. PRIMARY KEY `property_name` (`property_name`, `value`)
  517. ) ENGINE = MyISAM DEFAULT CHARSET = utf8";
  518. $result = mysql_query($sql) or die(mysql_error()."<br>
  519. on query<br>
  520. $sql
  521. <br> made on get-metadata.php line 457");
  522. // Count the sentence-level metadata
  523. $sql = "INSERT IGNORE INTO $main_metadata_counts_table
  524. (property_name, value, count, document_count)
  525. SELECT m.property_name as name,
  526. m.value as value,
  527. count(m.value),
  528. count(distinct m.document_id)
  529. FROM metadata_structure as ms, metadata as m,
  530. sentence_xref_unit as s
  531. WHERE
  532. ms.property_name = m.property_name
  533. AND ms.is_category = 1
  534. AND s.unit_id = m.unit_id
  535. GROUP BY m.property_id, m.value
  536. ORDER BY m.property_name ASC, m.value ASC;";
  537. $result = mysql_query($sql) or die(mysql_error()."<br>
  538. on query<br>
  539. $sql
  540. <br> made on get-metadata.php line 572");
  541. // Count the metadata belonging to other units above the sentence
  542. // level.
  543. $sql = "INSERT IGNORE INTO $main_metadata_counts_table
  544. (property_name, value, count, document_count)
  545. SELECT m.property_name as name,
  546. m.value as value,
  547. count(m.value),
  548. count(distinct m.document_id)
  549. FROM metadata_structure as ms, metadata as m
  550. WHERE
  551. ms.property_name = m.property_name
  552. AND ms.is_category = 1
  553. AND m.unit_name = 'sentence'
  554. GROUP BY m.property_id, m.value
  555. ORDER BY m.property_name ASC, m.value ASC;";
  556. $result = mysql_query($sql) or die(mysql_error()."<br>
  557. on query<br>
  558. $sql
  559. <br> made on get-metadata.php line 544");
  560. }
  561. $metadata_structure = loadMetadataStructure();
  562. $sql = "SELECT * from $main_metadata_counts_table;";
  563. $result = mysql_query($sql) or die(mysql_error()."<br>
  564. on query<br>
  565. $sql
  566. <br> made on get-metadata.php line 481");
  567. while ($row = mysql_fetch_assoc($result)) {
  568. $name = $row['property_name'];
  569. $value = $row['value'];
  570. $id = $metadata_structure[$name]['property_id'];
  571. $count = $row['count'];
  572. $document_count = $row['document_count'];
  573. $type = $metadata_structure[$name]['type'];
  574. if(!array_key_exists($name, $metadata)){
  575. $metadata[$name] = array("children"=>array(),
  576. "count"=>0,
  577. "document_count"=>0,
  578. "sentence_count"=>0,
  579. "text"=>$name,
  580. "propertyName"=>$name,
  581. "displayName"=> $metadata_structure[$name][
  582. 'name_to_display'],
  583. "property_id"=>$id,
  584. "type"=>$type);
  585. }
  586. array_push($metadata[$name]["children"],
  587. array("text"=>$value,
  588. "count"=>$count,
  589. "document_count"=>$document_count,
  590. "propertyName"=>$name,
  591. "value"=>$value));
  592. $metadata[$name]["count"] += $count;
  593. $metadata[$name]["document_count"] += $document_count;
  594. }
  595. } else {
  596. $metadata_structure = loadMetadataStructure();
  597. $info = getMetadataInformationForSentences($all);
  598. $sentences = array();
  599. foreach ($info['sentences'] as $row) {
  600. foreach (array_keys($row) as $identifier) {
  601. if ($identifier != "sentence_id" &&
  602. $identifier != "document_id" &&
  603. $identifier != "words" &&
  604. $identifier != "unit_id") {
  605. $components = explode("__", $identifier);
  606. $name = $components[1];
  607. $value = $row[$identifier];
  608. $id = $metadata_structure[$name]['property_id'];
  609. $type = $components[0];
  610. if(!array_key_exists($name, $metadata)){
  611. $sentences[$name] = array();
  612. $documents[$name] = array();
  613. $metadata[$name] = array("children"=>array(),
  614. "count"=>0,
  615. "document_count"=>0,
  616. "text"=>$name,
  617. "propertyName"=>$name,
  618. "displayName"=>$metadata_structure[$name][
  619. "name_to_display"],
  620. "property_id"=>$id,
  621. "type"=>$type);
  622. }
  623. if(!array_key_exists($value, $metadata[$name]["children"])){
  624. $metadata[$name]["children"][$value] = array("count"=>0,
  625. "document_count"=>0);
  626. $sentences[$name][$value] = array();
  627. $documents[$name][$value] = array();
  628. }
  629. if (!array_key_exists($row['sentence_id'],
  630. $sentences[$name][$value])) {
  631. $sentences[$name][$value][$row['sentence_id']] = 1;
  632. $leaf = $metadata[$name]["children"][$value]
  633. ["count"] += 1;
  634. $metadata[$name]["count"] += 1;
  635. if (!array_key_exists($row['document_id'],
  636. $documents[$name][$value])) {
  637. $documents[$name][$value][$row['document_id']] = 1;
  638. $metadata[$name]["document_count"] += 1;
  639. $metadata[$name]["children"][$value]
  640. ['document_count'] += 1;
  641. }
  642. }
  643. }
  644. }
  645. }
  646. foreach(array_keys($metadata) as $name){
  647. $children = $metadata[$name]["children"];
  648. $leaves = array();
  649. foreach(array_keys($children) as $value){
  650. array_push($leaves, array(
  651. "count"=>$children[$value]['count'],
  652. "document_count"=>$children[$value]['document_count'],
  653. "propertyName"=>$name,
  654. "text"=>$value,
  655. "value"=>$value));
  656. }
  657. $metadata[$name]["children"] = $leaves;
  658. }
  659. }
  660. $t2 = time();
  661. if ($timing) {
  662. echo "<br>
  663. Time to get metadata tree for sentence ids: ".($t2-$t1)."s
  664. <br>";
  665. }
  666. //Add in the names of the working sets belonging to this user and remove
  667. // all others.
  668. $user = getGetParam('user');
  669. if ($user) {
  670. $sql = "SELECT * from working_set where username = '$user';";
  671. $result = mysql_query($sql) or die(mysql_error()."<br>
  672. on query<br>
  673. $sql
  674. <br> made on get-metadata.php line 663");
  675. $set_names = array();
  676. while ($row = mysql_fetch_array($result)) {
  677. $set_names[$row['id']] = $row['name'];
  678. }
  679. $keys = array("document_set", "sentence_set", "word_set", "");
  680. foreach($keys as $name) {
  681. if ($metadata[$name]) {
  682. $metadata[$name]["count"] = 0;
  683. $children = $metadata[$name]["children"];
  684. if (count($children) > 0) {
  685. $renamed = array();
  686. foreach ($children as $child) {
  687. $set_name = $set_names[$child["value"]];
  688. if ($set_name) {
  689. $child["text"] = $set_name;
  690. array_push($renamed, $child);
  691. $metadata[$name]["count"] += $child["count"];
  692. }
  693. }
  694. if (count($renamed) > 0) {
  695. $metadata[$name]["children"] = $renamed;
  696. } else {
  697. unset($metadata[$name]);
  698. }
  699. } else {
  700. unset($metadata[$name]);
  701. }
  702. if ($metadata[$name]["count"] == 0) {
  703. unset($metadata[$name]);
  704. }
  705. }
  706. }
  707. } else {
  708. unset($metadata['document_set']);
  709. unset($metadata['sentence_set']);
  710. unset($metadata['word_set']);
  711. }
  712. return $metadata;
  713. }
  714. /**
  715. * Return format:
  716. * { docID1: { prop1:value1, prop2:value2, ...},
  717. */
  718. function getMetadataInformationForDocuments($documentIDs, $all) {
  719. $metadata = array();
  720. $sql = "SELECT `metadata`.`document_id`,
  721. `metadata`.`property_name`,
  722. `metadata`.`value`,
  723. `metadata`.`property_id`
  724. FROM `metadata_structure`, `metadata`
  725. WHERE `metadata`.`property_id` = `metadata_structure`.`property_id`
  726. AND `metadata_structure`.`is_category` != 0
  727. AND `metadata_structure`.`value_is_displayed` != 0
  728. AND `metadata_structure`.`unit_name`='document';";
  729. if (count($documentIDs) > 0) {
  730. $docIDsStr = join(', ', $documentIDs);
  731. $sql .= "AND `metadata`.`document_id` IN ($docIDsStr)";
  732. }
  733. $sql .= "ORDER BY `metadata`.`document_id`, `metadata`.`property_id`";
  734. if ($all || $documentIDs == 'all' || count($documentIDs) > 0) {
  735. $result = mysql_query($sql);
  736. while ($row = mysql_fetch_assoc($result)) {
  737. $metadata[$row['document_id']][$row['property_name']] = $row['value'];
  738. }
  739. }
  740. return $metadata;
  741. }
  742. /** Return format:
  743. {
  744. sentences: Array[{sentence_id:id, prop1:value1, ... , propN: valueN}, ..., ]
  745. documents: Array[{document_id:id, prop1: value1, ...., propM: valueM}]
  746. one for each sentence ID and document ID.
  747. */
  748. function getMetadataInformationForSentences($all) {
  749. global $timing;
  750. $info = array();
  751. $sentence_info = array();
  752. $document_info = array();
  753. $sentences = array();
  754. $documents = array();
  755. $metadata_structure = loadMetadataStructure();
  756. $table_identifier = 'filtered_sent_ids';
  757. $query_id_where = '';
  758. global $cache_results;
  759. global $query_id;
  760. global $dont_cache_search_results;
  761. if ($cache_results || $query_id && !$dont_cache_search_results) {
  762. $table_identifier = 'cached_filtered_sent_ids';
  763. $query_id_where = " AND query_id = $query_id ";
  764. }
  765. if ($all) {
  766. } else {
  767. resetUnitsFilter();
  768. $sql = " INSERT IGNORE INTO filtered_unit_ids
  769. (sentence_id, unit_id, document_id)
  770. SELECT sentence_id, unit_id, document_id
  771. from $table_identifier , sentence_xref_unit
  772. WHERE sentence_id = id
  773. $query_id_where;";
  774. $result = mysql_query($sql) or die ("<br> Error inserting unit ids's
  775. into filtered_unit_ids
  776. <br>".mysql_error()."
  777. <br> on query:
  778. <br> $sql
  779. <br> at get-metadata.php l. 538.");
  780. $sql = " INSERT IGNORE INTO filtered_unit_ids
  781. (sentence_id, unit_id, document_id)
  782. SELECT id, id, document_id from $table_identifier
  783. WHERE TRUE $query_id_where;";
  784. $result = mysql_query($sql) or die ("<br> Error inserting unit ids's
  785. into filtered_unit_ids
  786. <br>".mysql_error()."
  787. <br> on query:
  788. <br> $sql
  789. <br> at get-metadata.php l. 538.");
  790. $sql = "SELECT m.property_name as name,
  791. m.value as value,
  792. m.unit_id as unit_id,
  793. u.sentence_id as sentence_id,
  794. u.document_id as document_id
  795. FROM filtered_unit_ids as u, metadata as m
  796. WHERE m.unit_id = u.unit_id
  797. ORDER BY sentence_id;";
  798. $result = mysql_query($sql) or die(mysql_error()."<br>
  799. on query<br>
  800. $sql
  801. <br> made on get-metadata.php line 545");
  802. if ($timing) {
  803. $sql = "SELECT count(*) as count from filtered_unit_ids;";
  804. $res = mysql_query($sql);
  805. $row = mysql_fetch_assoc($res);
  806. echo "number of matched units: ".$row['count']."<br>";
  807. }
  808. while ($row = mysql_fetch_array($result)) {
  809. $sentence_id = $row['sentence_id'];
  810. $document_id = $row['document_id'];
  811. $unit_id = $row['unit_id'];
  812. if (!array_key_exists($sentence_id, $sentence_info)) {
  813. $sentence_info[$sentence_id] = array(
  814. 'sentence_id'=>array($sentence_id),
  815. 'document_id'=>array($document_id)
  816. );
  817. }
  818. $metadata_info = $metadata_structure[$row['name']];
  819. if ($metadata_info['is_category'] == 1
  820. || $metadata_info['value_is_displayed'] == 1) {
  821. $property_name_identifier = ($metadata_info['type']."__".
  822. $metadata_info['property_name']);
  823. if (!array_key_exists($property_name_identifier,
  824. $sentence_info[$sentence_id])) {
  825. $sentence_info[$sentence_id][$property_name_identifier] =
  826. array();
  827. }
  828. array_push($sentence_info[$sentence_id][$property_name_identifier],
  829. $row['value']);
  830. }
  831. }
  832. }
  833. $sentence_ids = array_keys($sentence_info);
  834. sort($sentence_ids);
  835. foreach($sentence_ids as $sentence_id) {
  836. $info = $sentence_info[$sentence_id];
  837. $properties = array();
  838. foreach(array_keys($info) as $property) {
  839. $components = explode("__", $property);
  840. $name = $components[1];
  841. if ($metadata_structure[$name]['is_category'] == 1
  842. || count($components) < 2) {
  843. array_push($properties, $property);
  844. }
  845. }
  846. $variants = makeAllVariants($info, $properties);
  847. foreach ($variants as $v) {
  848. array_push($sentences, $v);
  849. }
  850. }
  851. $info['sentences'] = $sentences;
  852. $info['documents'] = $documents;
  853. return $info;
  854. }
  /** Sentences may carry several values for one metadata attribute. This
      builds one replica per combination of values, so that every combination
      of the listed attributes is represented exactly once.

      $info       maps a property name to the list of values it holds.
      $properties lists the property names (keys of $info) to combine; the
                  caller decides which properties take part (e.g. only those
                  for which is_category is true).

      Returns a list of associative arrays (property name => single value).
      The list is empty when $properties is empty or when any listed property
      has no values.
  */
  function makeAllVariants($info, $properties) {
      $remaining = count($properties);
      if ($remaining == 0) {
          return array();
      }

      // Seed the combinations with the last property, then fold the earlier
      // properties in from right to left. This yields the same ordering as
      // recursing on the tail of the list first.
      $last = $properties[$remaining - 1];
      $combined = makeVariants($last, $info[$last], array());

      for ($i = $remaining - 2; $i >= 0; $i--) {
          $name = $properties[$i];
          $expanded = array();
          foreach ($combined as $partial) {
              // Each partial combination fans out into one entry per value
              // of the property currently being folded in.
              foreach (makeVariants($name, $info[$name], $partial) as $v) {
                  $expanded[] = $v;
              }
          }
          $combined = $expanded;
      }
      return $combined;
  }
  /** Produces one copy of $input per entry of $values, each copy extended
      (or overwritten) with $property_name set to that entry.

      $property_name  key to set on every copy.
      $values         list of values; one output entry is made per value.
      $input          associative array used as the starting point of every
                      copy; it is not modified.

      Returns a list of associative arrays, in the order of $values.
  */
  function makeVariants($property_name, $values, $input) {
      $result = array();
      foreach ($values as $single_value) {
          // Arrays are copied by value on assignment, so each entry gets its
          // own independent copy of $input.
          $copy = $input;
          $copy[$property_name] = $single_value;
          $result[] = $copy;
      }
      return $result;
  }
  /** Loads every row of the `metadata_structure` table, keyed by its
      `property_name` column.

      Return format
      {
          propertyName: { ...all columns of that metadata_structure row... },
      }
      If several rows share a property_name, the last one fetched wins.

      Stops the script with the MySQL error message if the query fails, as
      the other queries in this file do, instead of silently returning an
      empty structure.
  */
  function loadMetadataStructure() {
      $info = array();
      $sql = "SELECT * from metadata_structure;";
      // mysql_query() returns FALSE on failure; passing that on to
      // mysql_fetch_assoc() would only raise a warning and yield no rows.
      $result = mysql_query($sql) or die(mysql_error()." On <br> $sql");
      while ($row = mysql_fetch_assoc($result)) {
          $info[$row['property_name']] = $row;
      }
      return $info;
  }
  /** Returns the sentence id's that match the given filters.

      The metadata/collection filter and the phrases filter are evaluated
      separately by getSentenceIDsForMetadataAndCollection() and
      getSentenceIDsForPhrases(). Each result is compared against the string
      'all', which is treated as "this filter is not active".

      Returns:
        - the intersection of both results when both filters are active
          (array_intersect keeps the keys of the first array),
        - the result of the single active filter when only one is active,
        - the string "all" when neither filter is active.

      When the global $timing flag is set, progress information is echoed.

      NOTE(review): the original comment says the GET param "document_id" is
      also consulted. That is not visible in this function; presumably it
      happens inside the helpers -- confirm.
  */
  function getSentenceIDsForFilters($metadata, $collection, $phrases) {
      global $timing;
      resetSentenceFilters();
      include_once "../phrases/get-phrases.php";
      // Use the same truthiness test as the closing debug block below, so the
      // header and footer of the debug output are always printed together.
      if ($timing) {
          echo "<br>----- Getting sentence ID's that match filters -------<br>";
      }

      // Apply the metadata and collection filters.
      $filtered_by_metadata_and_collection =
          getSentenceIDsForMetadataAndCollection($metadata, $collection);
      $metadata_and_collection_filter_active =
          ($filtered_by_metadata_and_collection != 'all');

      // Apply the phrases filter.
      $filtered_by_phrases = getSentenceIDsForPhrases($phrases);
      $phrases_filter_active = ($filtered_by_phrases != 'all');

      if ($metadata_and_collection_filter_active && $phrases_filter_active) {
          $sentence_ids = array_intersect($filtered_by_metadata_and_collection,
              $filtered_by_phrases);
      } else if ($metadata_and_collection_filter_active) {
          $sentence_ids = $filtered_by_metadata_and_collection;
      } else if ($phrases_filter_active) {
          $sentence_ids = $filtered_by_phrases;
      } else {
          $sentence_ids = "all";
      }

      if ($timing) {
          echo "<br> Number of sentence ID's that match all filters: ";
          if ($sentence_ids != "all") {
              echo count($sentence_ids)."<br>";
          } else {
              echo "all<br>";
          }
          echo "<br>Finished getting sentence ID's that match filters <hr>";
      }
      return $sentence_ids;
  }
  /** Entry point for direct requests to get-metadata.php.

      Reads the search parameters from the GET request, runs either a plain
      search (no 'relation' given) or a dependency search, and echoes a JSON
      array holding the 'children' entries of every property found under the
      'metadata' key of the search results.

      When no relation is given and 'statistics' is set, no search is run and
      an empty JSON array is echoed.
  */
  function metadata_dispatch() {
      include_once "../util.php";
      include_once "../grammaticalsearch/get-search-results.php";

      // Query parameters
      $gov = getGetParam('gov');
      $govtype = getGetParam('govtype');
      $dep = getGetParam('dep');
      $deptype = getGetParam('deptype');
      $relation = getGetParam('relation');
      $collection = getGetParam('collection');
      $statistics = getGetParam('statistics');
      // Previously assigned to the misspelled $phrasess, which left $phrases
      // undefined and passed null to the search functions below.
      $phrases = decodeGetJson('phrases');
      $metadata = decodeGetJson('metadata');
      // NOTE(review): this sets a function-local $timing that is not read
      // here; other functions read a global $timing. Confirm whether a
      // `global $timing;` declaration was intended.
      $timing = getGetParam('timing');

      if ($relation == "") {
          if (!$statistics) {
              $results = getSearchResults($gov, $govtype, $collection,
                  $metadata, $phrases);
          } else {
              $results = array('statistics'=>array(), 'sentences'=>array());
          }
      } else {
          $results = getDependencySearchResults($gov, $govtype, $dep,
              $deptype, $relation, $collection, $statistics, $metadata, $phrases);
      }

      $metadata_results = array();
      // The statistics-only placeholder above has no 'metadata' key; guard so
      // that no PHP warnings are emitted ahead of the JSON body.
      if (isset($results['metadata']) && is_array($results['metadata'])) {
          foreach ($results['metadata'] as $property_results) {
              if (!isset($property_results['children'])
                  || !is_array($property_results['children'])) {
                  continue;
              }
              foreach ($property_results['children'] as $child) {
                  $metadata_results[] = $child;
              }
          }
      }
      echo json_encode($metadata_results);
  }
  977. ?>