PageRenderTime 26ms CodeModel.GetById 30ms RepoModel.GetById 0ms app.codeStats 1ms

/components/dom_distiller/standalone/content_extractor_browsertest.cc

https://gitlab.com/jonnialva90/iridium-browser
C++ | 429 lines | 339 code | 57 blank | 33 comment | 38 complexity | 1f3b4ce3d9b0a078e17c3f65f7b0e2f7 MD5 | raw file
  1. // Copyright 2014 The Chromium Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style license that can be
  3. // found in the LICENSE file.
  4. #include <sstream>
  5. #include "base/command_line.h"
  6. #include "base/files/scoped_temp_dir.h"
  7. #include "base/id_map.h"
  8. #include "base/location.h"
  9. #include "base/path_service.h"
  10. #include "base/run_loop.h"
  11. #include "base/single_thread_task_runner.h"
  12. #include "base/strings/string_number_conversions.h"
  13. #include "base/strings/string_split.h"
  14. #include "base/thread_task_runner_handle.h"
  15. #include "components/dom_distiller/content/browser/distiller_javascript_utils.h"
  16. #include "components/dom_distiller/content/browser/distiller_page_web_contents.h"
  17. #include "components/dom_distiller/core/article_entry.h"
  18. #include "components/dom_distiller/core/distilled_page_prefs.h"
  19. #include "components/dom_distiller/core/distiller.h"
  20. #include "components/dom_distiller/core/dom_distiller_service.h"
  21. #include "components/dom_distiller/core/dom_distiller_store.h"
  22. #include "components/dom_distiller/core/proto/distilled_article.pb.h"
  23. #include "components/dom_distiller/core/proto/distilled_page.pb.h"
  24. #include "components/dom_distiller/core/task_tracker.h"
  25. #include "components/leveldb_proto/proto_database.h"
  26. #include "components/leveldb_proto/proto_database_impl.h"
  27. #include "components/pref_registry/testing_pref_service_syncable.h"
  28. #include "content/public/browser/browser_context.h"
  29. #include "content/public/browser/browser_thread.h"
  30. #include "content/public/common/isolated_world_ids.h"
  31. #include "content/public/test/content_browser_test.h"
  32. #include "content/shell/browser/shell.h"
  33. #include "google/protobuf/io/coded_stream.h"
  34. #include "google/protobuf/io/zero_copy_stream_impl_lite.h"
  35. #include "net/dns/mock_host_resolver.h"
  36. #include "third_party/dom_distiller_js/dom_distiller.pb.h"
  37. #include "ui/base/resource/resource_bundle.h"
  38. using content::ContentBrowserTest;
  39. namespace dom_distiller {
  40. namespace {
  41. typedef base::hash_map<std::string, std::string> FileToUrlMap;
  42. }
  43. // Factory for creating a Distiller that creates different DomDistillerOptions
  44. // for different URLs, i.e. a specific kOriginalUrl option for each URL.
  45. class TestDistillerFactoryImpl : public DistillerFactory {
  46. public:
  47. TestDistillerFactoryImpl(
  48. scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory,
  49. const dom_distiller::proto::DomDistillerOptions& dom_distiller_options,
  50. const FileToUrlMap& file_to_url_map)
  51. : distiller_url_fetcher_factory_(distiller_url_fetcher_factory.Pass()),
  52. dom_distiller_options_(dom_distiller_options),
  53. file_to_url_map_(file_to_url_map) {
  54. }
  55. ~TestDistillerFactoryImpl() override {}
  56. scoped_ptr<Distiller> CreateDistillerForUrl(const GURL& url) override {
  57. dom_distiller::proto::DomDistillerOptions options;
  58. options = dom_distiller_options_;
  59. FileToUrlMap::const_iterator it = file_to_url_map_.find(url.spec());
  60. if (it != file_to_url_map_.end()) {
  61. options.set_original_url(it->second);
  62. }
  63. scoped_ptr<DistillerImpl> distiller(new DistillerImpl(
  64. *distiller_url_fetcher_factory_, options));
  65. return distiller.Pass();
  66. }
  67. private:
  68. scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory_;
  69. dom_distiller::proto::DomDistillerOptions dom_distiller_options_;
  70. FileToUrlMap file_to_url_map_;
  71. };
  72. namespace {
  73. // The url to distill.
  74. const char* kUrlSwitch = "url";
  75. // A space-separated list of urls to distill.
  76. const char* kUrlsSwitch = "urls";
  77. // Indicates that DNS resolution should be disabled for this test.
  78. const char* kDisableDnsSwitch = "disable-dns";
  79. // Will write the distilled output to the given file instead of to stdout.
  80. const char* kOutputFile = "output-file";
  81. // Indicates to output a serialized protocol buffer instead of human-readable
  82. // output.
  83. const char* kShouldOutputBinary = "output-binary";
  84. // Indicates to output only the text of the article and not the enclosing html.
  85. const char* kExtractTextOnly = "extract-text-only";
  86. // Indicates to include debug output.
  87. const char* kDebugLevel = "debug-level";
  88. // The original URL of the page if |kUrlSwitch| is a file.
  89. const char* kOriginalUrl = "original-url";
  90. // A semi-colon-separated (i.e. ';') list of original URLs corresponding to
  91. // "kUrlsSwitch".
  92. const char* kOriginalUrls = "original-urls";
  93. // Maximum number of concurrent started extractor requests.
  94. const int kMaxExtractorTasks = 8;
  95. scoped_ptr<DomDistillerService> CreateDomDistillerService(
  96. content::BrowserContext* context,
  97. const base::FilePath& db_path,
  98. const FileToUrlMap& file_to_url_map) {
  99. scoped_refptr<base::SequencedTaskRunner> background_task_runner =
  100. content::BrowserThread::GetBlockingPool()->GetSequencedTaskRunner(
  101. content::BrowserThread::GetBlockingPool()->GetSequenceToken());
  102. // TODO(cjhopman): use an in-memory database instead of an on-disk one with
  103. // temporary directory.
  104. scoped_ptr<leveldb_proto::ProtoDatabaseImpl<ArticleEntry> > db(
  105. new leveldb_proto::ProtoDatabaseImpl<ArticleEntry>(
  106. background_task_runner));
  107. scoped_ptr<DomDistillerStore> dom_distiller_store(
  108. new DomDistillerStore(db.Pass(), db_path));
  109. scoped_ptr<DistillerPageFactory> distiller_page_factory(
  110. new DistillerPageWebContentsFactory(context));
  111. scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory(
  112. new DistillerURLFetcherFactory(context->GetRequestContext()));
  113. dom_distiller::proto::DomDistillerOptions options;
  114. if (base::CommandLine::ForCurrentProcess()->HasSwitch(kExtractTextOnly)) {
  115. options.set_extract_text_only(true);
  116. }
  117. int debug_level = 0;
  118. if (base::CommandLine::ForCurrentProcess()->HasSwitch(kDebugLevel) &&
  119. base::StringToInt(
  120. base::CommandLine::ForCurrentProcess()->GetSwitchValueASCII(
  121. kDebugLevel),
  122. &debug_level)) {
  123. options.set_debug_level(debug_level);
  124. }
  125. scoped_ptr<DistillerFactory> distiller_factory(
  126. new TestDistillerFactoryImpl(distiller_url_fetcher_factory.Pass(),
  127. options,
  128. file_to_url_map));
  129. // Setting up PrefService for DistilledPagePrefs.
  130. user_prefs::TestingPrefServiceSyncable* pref_service =
  131. new user_prefs::TestingPrefServiceSyncable();
  132. DistilledPagePrefs::RegisterProfilePrefs(pref_service->registry());
  133. return scoped_ptr<DomDistillerService>(new DomDistillerService(
  134. dom_distiller_store.Pass(),
  135. distiller_factory.Pass(),
  136. distiller_page_factory.Pass(),
  137. scoped_ptr<DistilledPagePrefs>(new DistilledPagePrefs(pref_service))));
  138. }
  139. void AddComponentsTestResources() {
  140. base::FilePath pak_file;
  141. base::FilePath pak_dir;
  142. PathService::Get(base::DIR_MODULE, &pak_dir);
  143. pak_file =
  144. pak_dir.Append(FILE_PATH_LITERAL("components_tests_resources.pak"));
  145. ui::ResourceBundle::GetSharedInstance().AddDataPackFromPath(
  146. pak_file, ui::SCALE_FACTOR_NONE);
  147. }
  148. bool WriteProtobufWithSize(
  149. const google::protobuf::MessageLite& message,
  150. google::protobuf::io::ZeroCopyOutputStream* output_stream) {
  151. google::protobuf::io::CodedOutputStream coded_output(output_stream);
  152. // Write the size.
  153. const int size = message.ByteSize();
  154. coded_output.WriteLittleEndian32(size);
  155. message.SerializeWithCachedSizes(&coded_output);
  156. return !coded_output.HadError();
  157. }
  158. std::string GetReadableArticleString(
  159. const DistilledArticleProto& article_proto) {
  160. std::stringstream output;
  161. output << "Article Title: " << article_proto.title() << std::endl;
  162. output << "# of pages: " << article_proto.pages_size() << std::endl;
  163. for (int i = 0; i < article_proto.pages_size(); ++i) {
  164. if (i > 0) output << std::endl;
  165. const DistilledPageProto& page = article_proto.pages(i);
  166. output << "Page " << i << std::endl;
  167. output << "URL: " << page.url() << std::endl;
  168. output << "Content: " << page.html() << std::endl;
  169. if (page.has_debug_info() && page.debug_info().has_log())
  170. output << "Log: " << page.debug_info().log() << std::endl;
  171. if (page.has_pagination_info()) {
  172. if (page.pagination_info().has_next_page()) {
  173. output << "Next Page: " << page.pagination_info().next_page()
  174. << std::endl;
  175. }
  176. if (page.pagination_info().has_prev_page()) {
  177. output << "Prev Page: " << page.pagination_info().prev_page()
  178. << std::endl;
  179. }
  180. }
  181. }
  182. return output.str();
  183. }
  184. } // namespace
  185. class ContentExtractionRequest : public ViewRequestDelegate {
  186. public:
  187. void Start(DomDistillerService* service, const gfx::Size& render_view_size,
  188. base::Closure finished_callback) {
  189. finished_callback_ = finished_callback;
  190. viewer_handle_ =
  191. service->ViewUrl(this,
  192. service->CreateDefaultDistillerPage(render_view_size),
  193. url_);
  194. }
  195. DistilledArticleProto GetArticleCopy() {
  196. return *article_proto_;
  197. }
  198. static ScopedVector<ContentExtractionRequest> CreateForCommandLine(
  199. const base::CommandLine& command_line,
  200. FileToUrlMap* file_to_url_map) {
  201. ScopedVector<ContentExtractionRequest> requests;
  202. if (command_line.HasSwitch(kUrlSwitch)) {
  203. GURL url;
  204. std::string url_string = command_line.GetSwitchValueASCII(kUrlSwitch);
  205. url = GURL(url_string);
  206. if (url.is_valid()) {
  207. requests.push_back(new ContentExtractionRequest(url));
  208. if (command_line.HasSwitch(kOriginalUrl)) {
  209. (*file_to_url_map)[url.spec()] =
  210. command_line.GetSwitchValueASCII(kOriginalUrl);
  211. }
  212. }
  213. } else if (command_line.HasSwitch(kUrlsSwitch)) {
  214. std::string urls_string = command_line.GetSwitchValueASCII(kUrlsSwitch);
  215. std::vector<std::string> urls = base::SplitString(
  216. urls_string, " ", base::TRIM_WHITESPACE, base::SPLIT_WANT_ALL);
  217. // Check for original-urls switch, which must exactly pair up with
  218. // |kUrlsSwitch| i.e. number of original urls must be same as that of
  219. // urls.
  220. std::vector<std::string> original_urls;
  221. if (command_line.HasSwitch(kOriginalUrls)) {
  222. std::string original_urls_string =
  223. command_line.GetSwitchValueASCII(kOriginalUrls);
  224. original_urls = base::SplitString(
  225. original_urls_string, " ",
  226. base::TRIM_WHITESPACE, base::SPLIT_WANT_ALL);
  227. if (original_urls.size() != urls.size())
  228. original_urls.clear();
  229. }
  230. for (size_t i = 0; i < urls.size(); ++i) {
  231. GURL url(urls[i]);
  232. if (url.is_valid()) {
  233. requests.push_back(new ContentExtractionRequest(url));
  234. // Only regard non-empty original urls.
  235. if (!original_urls.empty() && !original_urls[i].empty()) {
  236. (*file_to_url_map)[url.spec()] = original_urls[i];
  237. }
  238. } else {
  239. ADD_FAILURE() << "Bad url";
  240. }
  241. }
  242. }
  243. if (requests.empty()) {
  244. ADD_FAILURE() << "No valid url provided";
  245. }
  246. return requests.Pass();
  247. }
  248. private:
  249. ContentExtractionRequest(const GURL& url) : url_(url) {}
  250. void OnArticleUpdated(ArticleDistillationUpdate article_update) override {}
  251. void OnArticleReady(const DistilledArticleProto* article_proto) override {
  252. article_proto_ = article_proto;
  253. CHECK(article_proto->pages_size()) << "Failed extracting " << url_;
  254. base::ThreadTaskRunnerHandle::Get()->PostTask(FROM_HERE,
  255. finished_callback_);
  256. }
  257. const DistilledArticleProto* article_proto_;
  258. scoped_ptr<ViewerHandle> viewer_handle_;
  259. GURL url_;
  260. base::Closure finished_callback_;
  261. };
  262. class ContentExtractor : public ContentBrowserTest {
  263. public:
  264. ContentExtractor()
  265. : pending_tasks_(0),
  266. max_tasks_(kMaxExtractorTasks),
  267. next_request_(0),
  268. output_data_(),
  269. protobuf_output_stream_(
  270. new google::protobuf::io::StringOutputStream(&output_data_)) {}
  271. // Change behavior of the default host resolver to avoid DNS lookup errors, so
  272. // we can make network calls.
  273. void SetUpOnMainThread() override {
  274. if (!base::CommandLine::ForCurrentProcess()->HasSwitch(kDisableDnsSwitch)) {
  275. EnableDNSLookupForThisTest();
  276. }
  277. CHECK(db_dir_.CreateUniqueTempDir());
  278. AddComponentsTestResources();
  279. }
  280. void TearDownOnMainThread() override { DisableDNSLookupForThisTest(); }
  281. protected:
  282. // Creates the DomDistillerService and creates and starts the extraction
  283. // request.
  284. void Start() {
  285. const base::CommandLine& command_line =
  286. *base::CommandLine::ForCurrentProcess();
  287. FileToUrlMap file_to_url_map;
  288. requests_ = ContentExtractionRequest::CreateForCommandLine(
  289. command_line, &file_to_url_map);
  290. content::BrowserContext* context =
  291. shell()->web_contents()->GetBrowserContext();
  292. service_ = CreateDomDistillerService(context,
  293. db_dir_.path(),
  294. file_to_url_map);
  295. PumpQueue();
  296. }
  297. void PumpQueue() {
  298. while (pending_tasks_ < max_tasks_ && next_request_ < requests_.size()) {
  299. requests_[next_request_]->Start(
  300. service_.get(),
  301. shell()->web_contents()->GetContainerBounds().size(),
  302. base::Bind(&ContentExtractor::FinishRequest, base::Unretained(this)));
  303. ++next_request_;
  304. ++pending_tasks_;
  305. }
  306. }
  307. private:
  308. // Change behavior of the default host resolver to allow DNS lookup
  309. // to proceed instead of being blocked by the test infrastructure.
  310. void EnableDNSLookupForThisTest() {
  311. // mock_host_resolver_override_ takes ownership of the resolver.
  312. scoped_refptr<net::RuleBasedHostResolverProc> resolver =
  313. new net::RuleBasedHostResolverProc(host_resolver());
  314. resolver->AllowDirectLookup("*");
  315. mock_host_resolver_override_.reset(
  316. new net::ScopedDefaultHostResolverProc(resolver.get()));
  317. }
  318. // We need to reset the DNS lookup when we finish, or the test will fail.
  319. void DisableDNSLookupForThisTest() {
  320. mock_host_resolver_override_.reset();
  321. }
  322. void FinishRequest() {
  323. --pending_tasks_;
  324. if (next_request_ == requests_.size() && pending_tasks_ == 0) {
  325. Finish();
  326. } else {
  327. PumpQueue();
  328. }
  329. }
  330. void DoArticleOutput() {
  331. const base::CommandLine& command_line =
  332. *base::CommandLine::ForCurrentProcess();
  333. for (size_t i = 0; i < requests_.size(); ++i) {
  334. const DistilledArticleProto& article = requests_[i]->GetArticleCopy();
  335. if (command_line.HasSwitch(kShouldOutputBinary)) {
  336. WriteProtobufWithSize(article, protobuf_output_stream_.get());
  337. } else {
  338. output_data_ += GetReadableArticleString(article) + "\n";
  339. }
  340. }
  341. if (command_line.HasSwitch(kOutputFile)) {
  342. base::FilePath filename = command_line.GetSwitchValuePath(kOutputFile);
  343. ASSERT_EQ(
  344. (int)output_data_.size(),
  345. base::WriteFile(filename, output_data_.c_str(), output_data_.size()));
  346. } else {
  347. VLOG(0) << output_data_;
  348. }
  349. }
  350. void Finish() {
  351. DoArticleOutput();
  352. requests_.clear();
  353. service_.reset();
  354. base::ThreadTaskRunnerHandle::Get()->PostTask(
  355. FROM_HERE, base::MessageLoop::QuitWhenIdleClosure());
  356. }
  357. size_t pending_tasks_;
  358. size_t max_tasks_;
  359. size_t next_request_;
  360. base::ScopedTempDir db_dir_;
  361. scoped_ptr<net::ScopedDefaultHostResolverProc> mock_host_resolver_override_;
  362. scoped_ptr<DomDistillerService> service_;
  363. ScopedVector<ContentExtractionRequest> requests_;
  364. std::string output_data_;
  365. scoped_ptr<google::protobuf::io::StringOutputStream> protobuf_output_stream_;
  366. };
  367. IN_PROC_BROWSER_TEST_F(ContentExtractor, MANUAL_ExtractUrl) {
  368. SetDistillerJavaScriptWorldId(content::ISOLATED_WORLD_ID_CONTENT_END);
  369. Start();
  370. base::RunLoop().Run();
  371. }
  372. } // namespace dom_distiller