PageRenderTime 42ms CodeModel.GetById 14ms RepoModel.GetById 0ms app.codeStats 0ms

/rdf/sites/all/modules/rdf/vendor/arc/parsers/ARC2_CBJSONParser.php

https://gitlab.com/klausmig/CloudSemanticWeb
PHP | 267 lines | 209 code | 30 blank | 28 comment | 53 complexity | d3b40934f7769717de98c2971cd4f8a2 MD5 | raw file
  1. <?php
  2. /**
  3. * ARC2 CrunchBase API JSON Parser
  4. *
  5. * @author Benjamin Nowack <bnowack@semsol.com>
  6. * @license http://arc.semsol.org/license
  7. * @homepage <http://arc.semsol.org/>
  8. * @package ARC2
  9. * @version 2010-11-16
  10. */
  11. ARC2::inc('JSONParser');
  /**
   * Turns a decoded CrunchBase API JSON structure into RDF triples.
   *
   * The decoded structure is read from $this->struct and triples are emitted
   * through the inherited addT() method. Resources that carry a "permalink" get
   * a URI below $this->base; everything else gets an id from the inherited
   * createBnodeID() (ids starting with "_:" are treated as blank nodes here).
   *
   * Keys of a structure are dispatched dynamically: a key "foo_bar" is handled
   * by a method named extract<CamelCase(key)>RDF when such a method exists.
   * Private helpers therefore deliberately do NOT follow that naming pattern.
   *
   * NOTE(review): addT(), v(), v1(), camelCase() and createBnodeID() are
   * inherited and not visible in this file; their exact contracts should be
   * confirmed against the ARC2 base classes.
   */
  class ARC2_CBJSONParser extends ARC2_JSONParser {

    /**
     * @param mixed $a      Passed through unchanged to the parent constructor.
     * @param mixed $caller Passed through (by reference) to the parent constructor.
     */
    public function __construct($a, &$caller) {
      parent::__construct($a, $caller);
    }

    /**
     * Sets the base URI, the RDF namespace, the default property/class
     * namespace and the namespace-prefix map after the parent initialisation.
     */
    public function __init() {/* reader */
      parent::__init();
      $this->base = 'http://cb.semsol.org/';
      $this->rdf = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#';
      $this->default_ns = $this->base . 'ns#';
      $this->nsp = array($this->rdf => 'rdf');
    }

    /* */

    /**
     * Hook that starts the RDF extraction.
     */
    public function done() {
      $this->extractRDF();
    }

    /**
     * Extracts triples for the top-level structure in $this->struct.
     *
     * Nothing is emitted when the structure is not an array or when its type
     * cannot be detected.
     */
    public function extractRDF() {
      $struct = isset($this->struct) ? $this->struct : null;
      if (!is_array($struct)) return;
      $type = $this->getStructType($struct);
      if (!$type) return;
      $s = $this->getResourceID($struct, $type);
      /* rdf:type (the subject may be a bnode when the struct has no permalink) */
      $this->addT($s, $this->rdf . 'type', $this->default_ns . $this->camelCase($type), $this->getCBTermType($s), 'uri');
      /* explicit triples */
      $this->extractResourceRDF($struct, $s);
    }

    /**
     * Detects the CrunchBase entity type of a structure.
     *
     * Detection order: "crunchbase_url" path segment, then the name of the
     * relation that points to the structure, then characteristic keys.
     *
     * @param array  $struct Decoded JSON object.
     * @param string $rel    Key under which $struct was found (optional).
     * @return string Type name such as "company" or "person", '' if unknown.
     */
    public function getStructType($struct, $rel = '') {
      /* url-based; an URL that does not match falls through instead of being returned as the type */
      $url = $this->v('crunchbase_url', '', $struct);
      if ($url && is_string($url) && preg_match('/^.*crunchbase\.com\/([^\/]+)\/.*$/', $url, $m)) {
        return $m[1];
      }
      /* rel-based (string keys only, so that integer list keys never match) */
      $rel_types = array(
        'person' => 'person',
        'company' => 'company',
        'acquiring_company' => 'company',
        'firm' => 'company',
        'provider' => 'service-provider'
      );
      if (is_string($rel) && isset($rel_types[$rel])) return $rel_types[$rel];
      /* struct-based */
      if (isset($struct['_type'])) return $struct['_type'];
      if (isset($struct['round_code'])) return 'funding_round';
      if (isset($struct['products'])) return 'company';
      if (isset($struct['first_name'])) return 'person';
      if (isset($struct['investments'])) return 'financial-organization';
      if (isset($struct['launched_year'])) return 'product';
      if (isset($struct['providerships']) && is_array($struct['providerships'])) return 'service-provider';
      return '';
    }

    /**
     * Builds the identifier of a resource.
     *
     * @param array  $struct Decoded JSON object.
     * @param string $type   Entity type as returned by getStructType().
     * @return string "<base><type>/<permalink>#self" when a type and a permalink
     *                are available, otherwise the result of createBnodeID().
     */
    public function getResourceID($struct, $type) {
      if ($type && isset($struct['permalink'])) {
        return $this->base . $type . '/' . $struct['permalink'] . '#self';
      }
      return $this->createBnodeID();
    }

    /**
     * Maps a JSON key to a property URI.
     *
     * Selected plural keys are singularised, "tag_list" becomes "tag" and
     * "competitions" becomes "competitor".
     *
     * @param string $name JSON key.
     * @param string $ns   Namespace to use; defaults to $this->default_ns.
     * @return string Property URI.
     */
    public function getPropertyURI($name, $ns = '') {
      if (!$ns) $ns = $this->default_ns;
      if (preg_match('/^(product|funding_round|investment|acquisition|.+ship|office|milestone|.+embed|.+link|degree|fund)s/', $name, $m)) $name = $m[1];
      if ($name === 'tag_list') $name = 'tag';
      if ($name === 'competitions') $name = 'competitor';
      return $ns . $name;
    }

    /**
     * Derives the identifier of a nested resource from its parent's identifier.
     *
     * @param string $s   Parent identifier ("#self" is replaced by "/").
     * @param string $k   JSON key of the nested value (selected plurals are singularised).
     * @param int    $pos Zero-based position; the identifier uses $pos + 1.
     * @return string "<parent>/<key>-<n>#self"
     */
    public function createSubURI($s, $k, $pos) {
      $s = str_replace('#self', '/', $s);
      if (preg_match('/(office|ship|investment|milestone|fund|embed|link)s$/', $k)) $k = substr($k, 0, -1);
      return $s . $k . '-' . ($pos + 1) . '#self';
    }

    /* */

    /**
     * Emits triples for every key/value pair of a structure.
     *
     * @param array  $struct Decoded JSON object describing the resource $s.
     * @param string $s      Subject identifier (URI or "_:"-prefixed bnode id).
     * @param int    $pos    Position of $struct within a parent list, used for sub-URIs.
     */
    public function extractResourceRDF($struct, $s, $pos = 0) {
      if (!is_array($struct)) return;
      $s_type = $this->getCBTermType($s);
      $date_prefixes = array();
      foreach ($struct as $k => $v) {
        /* strict comparisons: on PHP < 8 an integer key 0 loosely equals any non-numeric string */
        if ($k === 'acquisition') $k = 'exit';
        if (preg_match('/^(.*)\_(year|month|day)$/', $k, $m) && !in_array($m[1], $date_prefixes, true)) {
          $date_prefixes[] = $m[1];
        }
        /* key-specific handler; the two generic entry points are never valid handlers */
        $sub_m = 'extract' . $this->camelCase($k) . 'RDF';
        $is_core = in_array(strtolower($sub_m), array('extractrdf', 'extractresourcerdf'), true);
        if (!$is_core && method_exists($this, $sub_m)) {
          $this->$sub_m($s, $s_type, $v);
          continue;
        }
        if (!$v) continue;
        $p = $this->getPropertyURI($k);
        /* simple, single v */
        if (!is_array($v)) {
          /* trim first so that whitespace-padded URLs are still recognised as URIs */
          $v = trim($v);
          if ($v === '') continue;
          $o_type = preg_match('/^[a-z]+\:[^\s]+$/is', $v) ? 'uri' : 'literal';
          $v = $this->getCBNormalizedURL($v);
          $this->addT($s, $p, $v, $s_type, $o_type);
          /* rdfs:label */
          if ($k === 'name') $this->addT($s, 'http://www.w3.org/2000/01/rdf-schema#label', $v, $s_type, $o_type);
          /* a dc:identifier triple for "permalink" is intentionally not emitted */
        }
        /* structured, single v */
        elseif (!$this->isFlatArray($v)) {
          if ($o_type = $this->getStructType($v, $k)) {/* known type */
            $o = $this->getResourceID($v, $o_type);
            $o_term = $this->getCBTermType($o);
            $this->addT($s, $p, $o, $s_type, $o_term);
            $this->addT($o, $this->rdf . 'type', $this->default_ns . $this->camelCase($o_type), $o_term, 'uri');
          }
          else {/* unknown type */
            $o = $this->createSubURI($s, $k, $pos);
            /* same term type as the recursive call will use for $o as a subject */
            $this->addT($s, $p, $o, $s_type, $this->getCBTermType($o));
            $this->extractResourceRDF($v, $o);
          }
        }
        /* value list */
        else {
          foreach ($v as $sub_pos => $sub_v) {
            $this->extractResourceRDF(array($k => $sub_v), $s, $sub_pos);
          }
        }
      }
      /* infer XSD triples */
      foreach ($date_prefixes as $prefix) {
        $this->inferDate($prefix, $s, $struct);
      }
    }

    /**
     * Tells whether an array is a list, judged by its first key only.
     *
     * @param mixed $v Value to inspect.
     * @return int 1 if the first key is numeric, 0 otherwise (also for empty or non-array input).
     */
    public function isFlatArray($v) {
      if (!is_array($v) || !$v) return 0;
      reset($v);
      return is_numeric(key($v)) ? 1 : 0;
    }

    /* */

    /**
     * Handler for "tag_list": emits one "tag" literal per ", "-separated entry.
     *
     * @return int|null 0 when there is nothing to process.
     */
    public function extractTagListRDF($s, $s_type, $v) {
      if (!$v || !is_string($v)) return 0;
      $p = $this->getPropertyURI('tag');
      foreach (preg_split('/\, /', $v) as $tag) {
        $tag = trim($tag);
        if ($tag === '') continue;
        $this->addT($s, $p, $tag, $s_type, 'literal');
      }
    }

    /**
     * Handler for "image": links every available size and records its width and height.
     *
     * Each entry of "available_sizes" is read as array(array(width, height), path);
     * malformed entries are skipped.
     *
     * @param string $rel Property name used for the link (default "image").
     * @return int|null 1 when there is nothing to process.
     */
    public function extractImageRDF($s, $s_type, $v, $rel = 'image') {
      if (!$v) return 1;
      if (!is_array($v) || !isset($v['available_sizes']) || !is_array($v['available_sizes'])) return 1;
      $rel_p = $this->getPropertyURI($rel);
      $width_p = $this->getPropertyURI('width');
      $height_p = $this->getPropertyURI('height');
      foreach ($v['available_sizes'] as $size) {
        if (!isset($size[0][0], $size[0][1], $size[1])) continue;
        $img = 'http://www.crunchbase.com/' . $size[1];
        $this->addT($s, $rel_p, $img, $s_type, 'uri');
        $this->addT($img, $width_p, $size[0][0], 'uri', 'literal');
        $this->addT($img, $height_p, $size[0][1], 'uri', 'literal');
      }
    }

    /**
     * Handler for "screenshots": treats every entry like an image, using the "screenshot" property.
     *
     * @return int|null 1 when there is nothing to process.
     */
    public function extractScreenshotsRDF($s, $s_type, $v) {
      if (!$v || !is_array($v)) return 1;
      foreach ($v as $sub_v) {
        $this->extractImageRDF($s, $s_type, $sub_v, 'screenshot');
      }
    }

    /**
     * Handler for "products": links the subject to each product resource.
     */
    public function extractProductsRDF($s, $s_type, $v) {
      if (!is_array($v)) return;
      $p = $this->getPropertyURI('product');
      foreach ($v as $sub_v) {
        $o = $this->getResourceID($sub_v, 'product');
        $this->addT($s, $p, $o, $s_type, $this->getCBTermType($o));
      }
    }

    /**
     * Handler for "competitions": links the subject to each competitor company.
     */
    public function extractCompetitionsRDF($s, $s_type, $v) {
      if (!is_array($v)) return;
      $p = $this->getPropertyURI('competitor');
      foreach ($v as $sub_v) {
        if (!is_array($sub_v) || !isset($sub_v['competitor'])) continue;
        $o = $this->getResourceID($sub_v['competitor'], 'company');
        $this->addT($s, $p, $o, $s_type, $this->getCBTermType($o));
      }
    }

    /**
     * Handler for "funding_rounds": creates one sub-resource per round and extracts its details.
     */
    public function extractFundingRoundsRDF($s, $s_type, $v) {
      if (!is_array($v)) return;
      $p = $this->getPropertyURI('funding_round');
      foreach ($v as $pos => $sub_v) {
        $o = $this->createSubURI($s, 'funding_round', $pos);
        $this->addT($s, $p, $o, $s_type, $this->getCBTermType($o));
        $this->extractResourceRDF($sub_v, $o, $pos);
      }
    }

    /**
     * Handler for "investments".
     *
     * Entries naming a person, company or financial organisation are linked to
     * that investor ("incoming"); entries with a "funding_round" get a
     * sub-resource that describes the round ("outgoing").
     */
    public function extractInvestmentsRDF($s, $s_type, $v) {
      if (!is_array($v)) return;
      $p = $this->getPropertyURI('investment');
      $investor_types = array('person' => 'person', 'company' => 'company', 'financial_org' => 'financial-organization');
      foreach ($v as $pos => $sub_v) {
        if (!is_array($sub_v)) continue;
        /* incoming */
        foreach ($investor_types as $k => $type) {
          if (!isset($sub_v[$k])) continue;
          $o = $this->getResourceID($sub_v[$k], $type);
          $this->addT($s, $p, $o, $s_type, $this->getCBTermType($o));
        }
        /* outgoing */
        if (isset($sub_v['funding_round'])) {
          $o = $this->createSubURI($s, 'investment', $pos);
          $this->addT($s, $p, $o, $s_type, $this->getCBTermType($o));
          $this->extractResourceRDF($sub_v['funding_round'], $o, $pos);
        }
      }
    }

    /**
     * Handler for "external_links": links the subject to each URL and titles the URL.
     */
    public function extractExternalLinksRDF($s, $s_type, $v) {
      $this->addCBLinkTriples($s, $s_type, $v, 'external_link');
    }

    /**
     * Handler for "web_presences": links the subject to each URL and titles the URL.
     */
    public function extractWebPresencesRDF($s, $s_type, $v) {
      $this->addCBLinkTriples($s, $s_type, $v, 'web_presence');
    }

    /**
     * Handler for "created_at": emits the timestamp converted by getAPIDateXSD().
     */
    public function extractCreatedAtRDF($s, $s_type, $v) {
      $v = $this->getAPIDateXSD($v);
      $this->addT($s, $this->getPropertyURI('created_at'), $v, $s_type, 'literal');
    }

    /**
     * Handler for "updated_at": emits the timestamp converted by getAPIDateXSD().
     */
    public function extractUpdatedAtRDF($s, $s_type, $v) {
      $v = $this->getAPIDateXSD($v);
      $this->addT($s, $this->getPropertyURI('updated_at'), $v, $s_type, 'literal');
    }

    /**
     * Converts an API timestamp such as "Fri Jan 16 21:11:48 UTC 2009" to
     * "2009-01-16T21:11:48Z".
     *
     * The day is zero-padded and the month name is matched case-insensitively
     * by its first three letters.
     *
     * @param mixed $val Timestamp string from the API.
     * @return string Converted timestamp, or the fallback '2000-01-01' when $val cannot be parsed.
     */
    public function getAPIDateXSD($val) {
      $months = array('Jan' => '01', 'Feb' => '02', 'Mar' => '03', 'Apr' => '04', 'May' => '05', 'Jun' => '06', 'Jul' => '07', 'Aug' => '08', 'Sep' => '09', 'Oct' => '10', 'Nov' => '11', 'Dec' => '12');
      if (is_string($val) && preg_match('/^[a-z]+ ([a-z]+) ([0-9]+) ([0-9]{2}\:[0-9]{2}\:[0-9]{2}) UTC ([0-9]{4})/i', $val, $m)) {
        $month = ucfirst(strtolower(substr($m[1], 0, 3)));
        if (isset($months[$month])) {
          return $m[4] . '-' . $months[$month] . '-' . str_pad($m[2], 2, '0', STR_PAD_LEFT) . 'T' . $m[3] . 'Z';
        }
      }
      return '2000-01-01';
    }

    /* */

    /**
     * Combines "<prefix>_year", "<prefix>_month" and "<prefix>_day" into a
     * single "<prefix>_date" literal.
     *
     * Each part is read via v1() with '00' as default and left-padded to two
     * characters; nothing is emitted when the result is '00-00-00'.
     *
     * @param string $prefix Common key prefix, e.g. "founded".
     * @param string $s      Subject identifier.
     * @param array  $struct Structure holding the date parts.
     */
    public function inferDate($prefix, $s, $struct) {
      $parts = array();
      foreach (array('year', 'month', 'day') as $suffix) {
        $val = $this->v1($prefix . '_' . $suffix, '00', $struct);
        $parts[] = str_pad($val, 2, '0', STR_PAD_LEFT);
      }
      $r = implode('-', $parts);
      if ($r != '00-00-00') {
        $this->addT($s, $this->getPropertyURI($prefix . '_date'), $r, $this->getCBTermType($s), 'literal');
      }
    }

    /* private helpers (names must not match the extract<Key>RDF dispatch pattern) */

    /**
     * Returns the term type of an identifier: 'bnode' for ids starting with "_:", else 'uri'.
     */
    private function getCBTermType($id) {
      return preg_match('/^\_\:/', $id) ? 'bnode' : 'uri';
    }

    /**
     * Appends a trailing slash to http(s) URLs that consist of scheme and host only.
     */
    private function getCBNormalizedURL($url) {
      return preg_match('/^https?\:\/\/[^\/]+$/', $url) ? $url . '/' : $url;
    }

    /**
     * Links $s to every "external_url" in a list and attaches the "title" to the URL.
     *
     * The URL is always a 'uri' subject, independent of the term type of $s.
     * Entries without a usable URL are skipped.
     *
     * @param string $rel Property name used for the link.
     */
    private function addCBLinkTriples($s, $s_type, $v, $rel) {
      if (!is_array($v)) return;
      $link_p = $this->getPropertyURI($rel);
      $title_p = $this->getPropertyURI('title');
      foreach ($v as $sub_v) {
        if (!is_array($sub_v) || !isset($sub_v['external_url']) || !is_string($sub_v['external_url'])) continue;
        $href = trim($sub_v['external_url']);
        if ($href === '') continue;
        $href = $this->getCBNormalizedURL($href);
        $this->addT($s, $link_p, $href, $s_type, 'uri');
        if (isset($sub_v['title']) && $sub_v['title'] !== '') {
          $this->addT($href, $title_p, $sub_v['title'], 'uri', 'literal');
        }
      }
    }

  }