PageRenderTime 56ms CodeModel.GetById 26ms RepoModel.GetById 1ms app.codeStats 0ms

/components/dom_distiller/standalone/content_extractor_browsertest.cc

https://github.com/chromium/chromium
C++ | 439 lines | 346 code | 57 blank | 36 comment | 39 complexity | b4f4802289d48db28ecac120f5f40f6d MD5 | raw file
Possible License(s): MPL-2.0-no-copyleft-exception, Apache-2.0, BSD-3-Clause
  1. // Copyright 2014 The Chromium Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style license that can be
  3. // found in the LICENSE file.
  4. #include <stddef.h>
  5. #include <memory>
  6. #include <sstream>
  7. #include <unordered_map>
  8. #include <utility>
  9. #include "base/bind.h"
  10. #include "base/command_line.h"
  11. #include "base/containers/id_map.h"
  12. #include "base/files/file_util.h"
  13. #include "base/files/scoped_temp_dir.h"
  14. #include "base/location.h"
  15. #include "base/memory/scoped_refptr.h"
  16. #include "base/path_service.h"
  17. #include "base/run_loop.h"
  18. #include "base/strings/string_number_conversions.h"
  19. #include "base/strings/string_split.h"
  20. #include "base/task/single_thread_task_runner.h"
  21. #include "base/threading/thread_restrictions.h"
  22. #include "base/threading/thread_task_runner_handle.h"
  23. #include "components/dom_distiller/content/browser/distiller_javascript_utils.h"
  24. #include "components/dom_distiller/content/browser/distiller_page_web_contents.h"
  25. #include "components/dom_distiller/core/article_entry.h"
  26. #include "components/dom_distiller/core/distilled_page_prefs.h"
  27. #include "components/dom_distiller/core/distiller.h"
  28. #include "components/dom_distiller/core/dom_distiller_service.h"
  29. #include "components/dom_distiller/core/proto/distilled_article.pb.h"
  30. #include "components/dom_distiller/core/proto/distilled_page.pb.h"
  31. #include "components/dom_distiller/core/task_tracker.h"
  32. #include "components/leveldb_proto/public/proto_database.h"
  33. #include "components/leveldb_proto/public/proto_database_provider.h"
  34. #include "components/sync_preferences/testing_pref_service_syncable.h"
  35. #include "content/public/browser/browser_context.h"
  36. #include "content/public/browser/storage_partition.h"
  37. #include "content/public/common/isolated_world_ids.h"
  38. #include "content/public/test/browser_test.h"
  39. #include "content/public/test/content_browser_test.h"
  40. #include "content/shell/browser/shell.h"
  41. #include "google/protobuf/io/coded_stream.h"
  42. #include "google/protobuf/io/zero_copy_stream_impl_lite.h"
  43. #include "net/dns/mock_host_resolver.h"
  44. #include "third_party/dom_distiller_js/dom_distiller.pb.h"
  45. #include "ui/base/resource/resource_bundle.h"
  46. using content::ContentBrowserTest;
  47. namespace dom_distiller {
  namespace {
  // Maps the spec of a URL being distilled to the original URL that should be
  // handed to the distiller for it.
  using FileToUrlMap = std::unordered_map<std::string, std::string>;
  }  // namespace
  51. // Factory for creating a Distiller that creates different DomDistillerOptions
  52. // for different URLs, i.e. a specific kOriginalUrl option for each URL.
  53. class TestDistillerFactoryImpl : public DistillerFactory {
  54. public:
  55. TestDistillerFactoryImpl(
  56. std::unique_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory,
  57. const dom_distiller::proto::DomDistillerOptions& dom_distiller_options,
  58. const FileToUrlMap& file_to_url_map)
  59. : distiller_url_fetcher_factory_(
  60. std::move(distiller_url_fetcher_factory)),
  61. dom_distiller_options_(dom_distiller_options),
  62. file_to_url_map_(file_to_url_map) {}
  63. ~TestDistillerFactoryImpl() override {}
  64. std::unique_ptr<Distiller> CreateDistillerForUrl(const GURL& url) override {
  65. dom_distiller::proto::DomDistillerOptions options;
  66. options = dom_distiller_options_;
  67. FileToUrlMap::const_iterator it = file_to_url_map_.find(url.spec());
  68. if (it != file_to_url_map_.end()) {
  69. options.set_original_url(it->second);
  70. }
  71. return std::make_unique<DistillerImpl>(*distiller_url_fetcher_factory_,
  72. options);
  73. }
  74. private:
  75. std::unique_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory_;
  76. dom_distiller::proto::DomDistillerOptions dom_distiller_options_;
  77. FileToUrlMap file_to_url_map_;
  78. };
  79. namespace {
  // Command-line switch names understood by this test. They are declared as
  // constant arrays (not mutable `const char*` pointers) so that neither the
  // characters nor the binding can be changed at runtime.

  // The url to distill.
  constexpr char kUrlSwitch[] = "url";
  // A space-separated list of urls to distill.
  constexpr char kUrlsSwitch[] = "urls";
  // Indicates that DNS resolution should be disabled for this test.
  constexpr char kDisableDnsSwitch[] = "disable-dns";
  // Will write the distilled output to the given file instead of to stdout.
  constexpr char kOutputFile[] = "output-file";
  // Indicates to output a serialized protocol buffer instead of human-readable
  // output.
  constexpr char kShouldOutputBinary[] = "output-binary";
  // Indicates to output only the text of the article and not the enclosing html.
  constexpr char kExtractTextOnly[] = "extract-text-only";
  // Integer debug level forwarded to the distiller options; a value that does
  // not parse as an integer is ignored.
  constexpr char kDebugLevel[] = "debug-level";
  // The original URL of the page if |kUrlSwitch| is a file.
  constexpr char kOriginalUrl[] = "original-url";
  // A space-separated list of original URLs corresponding to "kUrlsSwitch".
  // The value is split on ' ' and is only honoured when it yields exactly as
  // many entries as "kUrlsSwitch".
  constexpr char kOriginalUrls[] = "original-urls";
  // The pagination algorithm to use, one of "next", "pagenum".
  constexpr char kPaginationAlgo[] = "pagination-algo";
  // Maximum number of concurrent started extractor requests.
  constexpr int kMaxExtractorTasks = 8;
  104. std::unique_ptr<DomDistillerService> CreateDomDistillerService(
  105. content::BrowserContext* context,
  106. sync_preferences::TestingPrefServiceSyncable* pref_service,
  107. const FileToUrlMap& file_to_url_map) {
  108. // Setting up PrefService for DistilledPagePrefs.
  109. DistilledPagePrefs::RegisterProfilePrefs(pref_service->registry());
  110. auto distiller_page_factory =
  111. std::make_unique<DistillerPageWebContentsFactory>(context);
  112. auto distiller_url_fetcher_factory =
  113. std::make_unique<DistillerURLFetcherFactory>(
  114. context->GetDefaultStoragePartition()
  115. ->GetURLLoaderFactoryForBrowserProcess());
  116. dom_distiller::proto::DomDistillerOptions options;
  117. if (base::CommandLine::ForCurrentProcess()->HasSwitch(kExtractTextOnly)) {
  118. options.set_extract_text_only(true);
  119. }
  120. int debug_level = 0;
  121. if (base::CommandLine::ForCurrentProcess()->HasSwitch(kDebugLevel) &&
  122. base::StringToInt(
  123. base::CommandLine::ForCurrentProcess()->GetSwitchValueASCII(
  124. kDebugLevel),
  125. &debug_level)) {
  126. options.set_debug_level(debug_level);
  127. }
  128. // Options for pagination algorithm:
  129. // - "next": detect anchors with "next" text
  130. // - "pagenum": detect anchors with numeric page numbers
  131. // Default is "next".
  132. if (base::CommandLine::ForCurrentProcess()->HasSwitch(kPaginationAlgo)) {
  133. options.set_pagination_algo(
  134. base::CommandLine::ForCurrentProcess()->GetSwitchValueASCII(
  135. kPaginationAlgo));
  136. }
  137. auto distiller_factory = std::make_unique<TestDistillerFactoryImpl>(
  138. std::move(distiller_url_fetcher_factory), options, file_to_url_map);
  139. return std::make_unique<DomDistillerService>(
  140. std::move(distiller_factory), std::move(distiller_page_factory),
  141. std::make_unique<DistilledPagePrefs>(pref_service),
  142. /* distiller_ui_handle */ nullptr);
  143. }
  144. void AddComponentsTestResources() {
  145. base::FilePath pak_file;
  146. base::FilePath pak_dir;
  147. base::PathService::Get(base::DIR_ASSETS, &pak_dir);
  148. pak_file =
  149. pak_dir.Append(FILE_PATH_LITERAL("components_tests_resources.pak"));
  150. ui::ResourceBundle::GetSharedInstance().AddDataPackFromPath(
  151. pak_file, ui::kScaleFactorNone);
  152. }
  153. bool WriteProtobufWithSize(
  154. const google::protobuf::MessageLite& message,
  155. google::protobuf::io::ZeroCopyOutputStream* output_stream) {
  156. google::protobuf::io::CodedOutputStream coded_output(output_stream);
  157. // Write the size.
  158. const int size = message.ByteSize();
  159. coded_output.WriteLittleEndian32(size);
  160. message.SerializeWithCachedSizes(&coded_output);
  161. return !coded_output.HadError();
  162. }
  163. std::string GetReadableArticleString(
  164. const DistilledArticleProto& article_proto) {
  165. std::stringstream output;
  166. output << "Article Title: " << article_proto.title() << std::endl;
  167. output << "# of pages: " << article_proto.pages_size() << std::endl;
  168. for (int i = 0; i < article_proto.pages_size(); ++i) {
  169. if (i > 0) output << std::endl;
  170. const DistilledPageProto& page = article_proto.pages(i);
  171. output << "Page " << i << std::endl;
  172. output << "URL: " << page.url() << std::endl;
  173. output << "Content: " << page.html() << std::endl;
  174. if (page.has_debug_info() && page.debug_info().has_log())
  175. output << "Log: " << page.debug_info().log() << std::endl;
  176. if (page.has_pagination_info()) {
  177. if (page.pagination_info().has_next_page()) {
  178. output << "Next Page: " << page.pagination_info().next_page()
  179. << std::endl;
  180. }
  181. if (page.pagination_info().has_prev_page()) {
  182. output << "Prev Page: " << page.pagination_info().prev_page()
  183. << std::endl;
  184. }
  185. }
  186. }
  187. return output.str();
  188. }
  189. } // namespace
  190. class ContentExtractionRequest : public ViewRequestDelegate {
  191. public:
  192. ContentExtractionRequest(const GURL& url) : url_(url) {}
  193. void Start(DomDistillerService* service,
  194. const gfx::Size& render_view_size,
  195. base::OnceClosure finished_callback) {
  196. finished_callback_ = std::move(finished_callback);
  197. viewer_handle_ =
  198. service->ViewUrl(this,
  199. service->CreateDefaultDistillerPage(render_view_size),
  200. url_);
  201. }
  202. DistilledArticleProto GetArticleCopy() {
  203. return *article_proto_;
  204. }
  205. static std::vector<std::unique_ptr<ContentExtractionRequest>>
  206. CreateForCommandLine(const base::CommandLine& command_line,
  207. FileToUrlMap* file_to_url_map) {
  208. std::vector<std::unique_ptr<ContentExtractionRequest>> requests;
  209. if (command_line.HasSwitch(kUrlSwitch)) {
  210. GURL url;
  211. std::string url_string = command_line.GetSwitchValueASCII(kUrlSwitch);
  212. url = GURL(url_string);
  213. if (url.is_valid()) {
  214. requests.push_back(std::make_unique<ContentExtractionRequest>(url));
  215. if (command_line.HasSwitch(kOriginalUrl)) {
  216. (*file_to_url_map)[url.spec()] =
  217. command_line.GetSwitchValueASCII(kOriginalUrl);
  218. }
  219. }
  220. } else if (command_line.HasSwitch(kUrlsSwitch)) {
  221. std::string urls_string = command_line.GetSwitchValueASCII(kUrlsSwitch);
  222. std::vector<std::string> urls = base::SplitString(
  223. urls_string, " ", base::TRIM_WHITESPACE, base::SPLIT_WANT_ALL);
  224. // Check for original-urls switch, which must exactly pair up with
  225. // |kUrlsSwitch| i.e. number of original urls must be same as that of
  226. // urls.
  227. std::vector<std::string> original_urls;
  228. if (command_line.HasSwitch(kOriginalUrls)) {
  229. std::string original_urls_string =
  230. command_line.GetSwitchValueASCII(kOriginalUrls);
  231. original_urls = base::SplitString(
  232. original_urls_string, " ",
  233. base::TRIM_WHITESPACE, base::SPLIT_WANT_ALL);
  234. if (original_urls.size() != urls.size())
  235. original_urls.clear();
  236. }
  237. for (size_t i = 0; i < urls.size(); ++i) {
  238. GURL url(urls[i]);
  239. if (url.is_valid()) {
  240. requests.push_back(std::make_unique<ContentExtractionRequest>(url));
  241. // Only regard non-empty original urls.
  242. if (!original_urls.empty() && !original_urls[i].empty()) {
  243. (*file_to_url_map)[url.spec()] = original_urls[i];
  244. }
  245. } else {
  246. ADD_FAILURE() << "Bad url";
  247. }
  248. }
  249. }
  250. if (requests.empty()) {
  251. ADD_FAILURE() << "No valid url provided";
  252. }
  253. return requests;
  254. }
  255. private:
  256. void OnArticleUpdated(ArticleDistillationUpdate article_update) override {}
  257. void OnArticleReady(const DistilledArticleProto* article_proto) override {
  258. article_proto_ = article_proto;
  259. CHECK(article_proto->pages_size()) << "Failed extracting " << url_;
  260. base::ThreadTaskRunnerHandle::Get()->PostTask(
  261. FROM_HERE, std::move(finished_callback_));
  262. }
  263. const DistilledArticleProto* article_proto_;
  264. std::unique_ptr<ViewerHandle> viewer_handle_;
  265. GURL url_;
  266. base::OnceClosure finished_callback_;
  267. };
  268. class ContentExtractor : public ContentBrowserTest {
  269. public:
  270. ContentExtractor()
  271. : pending_tasks_(0),
  272. max_tasks_(kMaxExtractorTasks),
  273. next_request_(0),
  274. output_data_(),
  275. protobuf_output_stream_(
  276. std::make_unique<google::protobuf::io::StringOutputStream>(
  277. &output_data_)) {}
  278. // Change behavior of the default host resolver to avoid DNS lookup errors, so
  279. // we can make network calls.
  280. void SetUpOnMainThread() override {
  281. if (!base::CommandLine::ForCurrentProcess()->HasSwitch(kDisableDnsSwitch)) {
  282. EnableDNSLookupForThisTest();
  283. }
  284. AddComponentsTestResources();
  285. }
  286. void TearDownOnMainThread() override { DisableDNSLookupForThisTest(); }
  287. protected:
  288. // Creates the DomDistillerService and creates and starts the extraction
  289. // request.
  290. void Start() {
  291. const base::CommandLine& command_line =
  292. *base::CommandLine::ForCurrentProcess();
  293. FileToUrlMap file_to_url_map;
  294. requests_ = ContentExtractionRequest::CreateForCommandLine(
  295. command_line, &file_to_url_map);
  296. content::BrowserContext* context =
  297. shell()->web_contents()->GetBrowserContext();
  298. pref_service_ =
  299. std::make_unique<sync_preferences::TestingPrefServiceSyncable>();
  300. service_ = CreateDomDistillerService(context, pref_service_.get(),
  301. file_to_url_map);
  302. PumpQueue();
  303. }
  304. void PumpQueue() {
  305. while (pending_tasks_ < max_tasks_ && next_request_ < requests_.size()) {
  306. requests_[next_request_]->Start(
  307. service_.get(), shell()->web_contents()->GetContainerBounds().size(),
  308. base::BindOnce(&ContentExtractor::FinishRequest,
  309. base::Unretained(this)));
  310. ++next_request_;
  311. ++pending_tasks_;
  312. }
  313. }
  314. private:
  315. // Change behavior of the default host resolver to allow DNS lookup
  316. // to proceed instead of being blocked by the test infrastructure.
  317. void EnableDNSLookupForThisTest() {
  318. // mock_host_resolver_override_ takes ownership of the resolver.
  319. auto resolver =
  320. base::MakeRefCounted<net::RuleBasedHostResolverProc>(host_resolver());
  321. resolver->AllowDirectLookup("*");
  322. mock_host_resolver_override_ =
  323. std::make_unique<net::ScopedDefaultHostResolverProc>(resolver.get());
  324. }
  325. // We need to reset the DNS lookup when we finish, or the test will fail.
  326. void DisableDNSLookupForThisTest() {
  327. mock_host_resolver_override_.reset();
  328. }
  329. void FinishRequest() {
  330. --pending_tasks_;
  331. if (next_request_ == requests_.size() && pending_tasks_ == 0) {
  332. Finish();
  333. } else {
  334. PumpQueue();
  335. }
  336. }
  337. void DoArticleOutput() {
  338. base::ScopedAllowBlockingForTesting allow_blocing;
  339. const base::CommandLine& command_line =
  340. *base::CommandLine::ForCurrentProcess();
  341. for (size_t i = 0; i < requests_.size(); ++i) {
  342. const DistilledArticleProto& article = requests_[i]->GetArticleCopy();
  343. if (command_line.HasSwitch(kShouldOutputBinary)) {
  344. WriteProtobufWithSize(article, protobuf_output_stream_.get());
  345. } else {
  346. output_data_ += GetReadableArticleString(article) + "\n";
  347. }
  348. }
  349. if (command_line.HasSwitch(kOutputFile)) {
  350. base::FilePath filename = command_line.GetSwitchValuePath(kOutputFile);
  351. ASSERT_EQ(
  352. (int)output_data_.size(),
  353. base::WriteFile(filename, output_data_.c_str(), output_data_.size()));
  354. } else {
  355. VLOG(0) << output_data_;
  356. }
  357. }
  358. void Finish() {
  359. DoArticleOutput();
  360. requests_.clear();
  361. service_.reset();
  362. base::ThreadTaskRunnerHandle::Get()->PostTask(
  363. FROM_HERE, base::RunLoop::QuitCurrentWhenIdleClosureDeprecated());
  364. }
  365. size_t pending_tasks_;
  366. size_t max_tasks_;
  367. size_t next_request_;
  368. std::unique_ptr<net::ScopedDefaultHostResolverProc>
  369. mock_host_resolver_override_;
  370. std::unique_ptr<sync_preferences::TestingPrefServiceSyncable> pref_service_;
  371. std::unique_ptr<DomDistillerService> service_;
  372. std::vector<std::unique_ptr<ContentExtractionRequest>> requests_;
  373. std::string output_data_;
  374. std::unique_ptr<google::protobuf::io::StringOutputStream>
  375. protobuf_output_stream_;
  376. };
  377. IN_PROC_BROWSER_TEST_F(ContentExtractor, MANUAL_ExtractUrl) {
  378. SetDistillerJavaScriptWorldId(content::ISOLATED_WORLD_ID_CONTENT_END);
  379. Start();
  380. base::RunLoop().Run();
  381. }
  382. } // namespace dom_distiller