/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java

https://github.com/MarginaliaSearch/MarginaliaSearch · Java · 99 lines · 78 code · 21 blank · 0 comment · 2 complexity · 03d34ec335cb612e7b511dfbb90df3bd MD5 · raw file

  1. package nu.marginalia.converting;
  2. import com.google.gson.Gson;
  3. import com.google.inject.Guice;
  4. import com.google.inject.Inject;
  5. import com.google.inject.Injector;
  6. import nu.marginalia.process.log.WorkLog;
  7. import plan.CrawlPlanLoader;
  8. import plan.CrawlPlan;
  9. import nu.marginalia.converting.compiler.InstructionsCompiler;
  10. import nu.marginalia.converting.instruction.Instruction;
  11. import nu.marginalia.converting.processor.DomainProcessor;
  12. import nu.marginalia.crawling.model.CrawledDomain;
  13. import nu.marginalia.util.ParallelPipe;
  14. import org.slf4j.Logger;
  15. import org.slf4j.LoggerFactory;
  16. import java.io.IOException;
  17. import java.nio.file.Path;
  18. import java.util.List;
  19. public class ConverterMain {
  20. private final Logger logger = LoggerFactory.getLogger(getClass());
  21. private final InstructionWriter instructionWriter;
  22. public static void main(String... args) throws IOException {
  23. if (args.length != 1) {
  24. System.err.println("Arguments: crawl-plan.yaml");
  25. System.exit(0);
  26. }
  27. var plan = new CrawlPlanLoader().load(Path.of(args[0]));
  28. Injector injector = Guice.createInjector(
  29. new ConverterModule(plan)
  30. );
  31. injector.getInstance(ConverterMain.class);
  32. }
  33. @Inject
  34. public ConverterMain(
  35. CrawlPlan plan,
  36. DomainProcessor processor,
  37. InstructionsCompiler compiler,
  38. Gson gson
  39. ) throws Exception {
  40. logger.info("Starting pipe");
  41. try (WorkLog processLog = plan.createProcessWorkLog();
  42. ConversionLog log = new ConversionLog(plan.process.getDir())) {
  43. instructionWriter = new InstructionWriter(log, plan.process.getDir(), gson);
  44. var pipe = new ParallelPipe<CrawledDomain, ProcessingInstructions>("Converter", 16, 4, 2) {
  45. @Override
  46. protected ProcessingInstructions onProcess(CrawledDomain domainData) {
  47. Thread.currentThread().setName("Converter:Processor["+domainData.domain+"] - " + domainData.size());
  48. try {
  49. var processed = processor.process(domainData);
  50. var compiled = compiler.compile(processed);
  51. return new ProcessingInstructions(domainData.id, compiled);
  52. }
  53. finally {
  54. Thread.currentThread().setName("Converter:Processor[IDLE]");
  55. }
  56. }
  57. @Override
  58. protected void onReceive(ProcessingInstructions processedInstructions) throws IOException {
  59. Thread.currentThread().setName("Converter:Receiver["+processedInstructions.id+"]");
  60. try {
  61. var instructions = processedInstructions.instructions;
  62. instructions.removeIf(Instruction::isNoOp);
  63. String where = instructionWriter.accept(processedInstructions.id, instructions);
  64. processLog.setJobToFinished(processedInstructions.id, where, instructions.size());
  65. }
  66. finally {
  67. Thread.currentThread().setName("Converter:Receiver[IDLE]");
  68. }
  69. }
  70. };
  71. plan.forEachCrawledDomain(id -> !processLog.isJobFinished(id), pipe::accept);
  72. pipe.join();
  73. }
  74. logger.info("Finished");
  75. System.exit(0);
  76. }
  77. record ProcessingInstructions(String id, List<Instruction> instructions) {}
  78. }