PageRenderTime 55ms CodeModel.GetById 26ms RepoModel.GetById 0ms app.codeStats 1ms

/marytts-builder/src/main/java/marytts/tools/dbselection/DatabaseSelector.java

http://github.com/marytts/marytts
Java | 914 lines | 642 code | 95 blank | 177 comment | 97 complexity | 7886549be6f1d60f9e4b88e8b60db266 MD5 | raw file
Possible License(s): LGPL-2.0, BSD-3-Clause, LGPL-2.1, 0BSD
  1. /**
  2. * Copyright 2007 DFKI GmbH.
  3. * All Rights Reserved. Use is subject to license terms.
  4. *
  5. * This file is part of MARY TTS.
  6. *
  7. * MARY TTS is free software: you can redistribute it and/or modify
  8. * it under the terms of the GNU Lesser General Public License as published by
  9. * the Free Software Foundation, version 3 of the License.
  10. *
  11. * This program is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. * GNU Lesser General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU Lesser General Public License
  17. * along with this program. If not, see <http://www.gnu.org/licenses/>.
  18. *
  19. */
  20. package marytts.tools.dbselection;
  21. import java.io.BufferedReader;
  22. import java.io.BufferedWriter;
  23. import java.io.ByteArrayInputStream;
  24. import java.io.ByteArrayOutputStream;
  25. import java.io.File;
  26. import java.io.FileInputStream;
  27. import java.io.FileOutputStream;
  28. import java.io.FileWriter;
  29. import java.io.IOException;
  30. import java.io.InputStreamReader;
  31. import java.io.OutputStreamWriter;
  32. import java.io.PrintWriter;
  33. import java.net.UnknownHostException;
  34. import java.text.DateFormat;
  35. import java.text.SimpleDateFormat;
  36. import java.util.Date;
  37. import java.util.LinkedHashSet;
  38. import java.util.Set;
  39. import javax.sound.sampled.UnsupportedAudioFileException;
  40. import javax.xml.parsers.ParserConfigurationException;
  41. import javax.xml.transform.TransformerConfigurationException;
  42. import javax.xml.transform.TransformerException;
  43. import org.w3c.dom.Document;
  44. import org.w3c.dom.Element;
  45. import org.w3c.dom.traversal.TreeWalker;
  46. import org.xml.sax.SAXException;
  47. import marytts.client.MaryClient;
  48. import marytts.datatypes.MaryData;
  49. import marytts.datatypes.MaryDataType;
  50. import marytts.datatypes.MaryXML;
  51. import marytts.features.FeatureDefinition;
  52. import marytts.server.Mary;
  53. import marytts.util.Pair;
  54. import marytts.util.dom.MaryDomUtils;
  55. import marytts.util.http.Address;
  56. import org.apache.commons.io.FileUtils;
  57. /**
  58. * Main class to be run over a database for selection
  59. *
  60. * @author Anna Hunecke
  61. *
  62. */
  63. public class DatabaseSelector
  64. {
  65. private static String locale;
  66. //the feature definition for the feature vectors
  67. public static FeatureDefinition featDef;
  68. //the file containing the feature definition
  69. private static String featDefFileName;
  70. //the file containing the coverage data needed to initialise the algorithm
  71. private static String initFileName;
  72. //the directory to print the selection results to
  73. private static String selectionDirName;
  74. //the config file for the coverage definition
  75. private static String covDefConfigFileName;
  76. //the stop criterion (as string)
  77. private static String stopCriterion;
  78. //the log file to log the result to
  79. private static String overallLogFile;
  80. //if true, feature vectors are kept in memory
  81. private static boolean holdVectorsInMemory;
  82. //if true, print more information to command line
  83. private static boolean verbose;
  84. //if true, print a table containing the coverage
  85. //development over time
  86. private static boolean logCovDevelopment;
  87. //private static List of selected sentences ids;
  88. private static Set<Integer> selectedIdSents;
  89. private static Set<Integer> unwantedIdSents;
  90. private static String selectedSentencesTableName;
  91. private static String tableDescription;
  92. // mySql database
  93. protected static DBHandler wikiToDB;
  94. private static String mysqlHost;
  95. private static String mysqlDB;
  96. private static String mysqlUser;
  97. private static String mysqlPasswd;
  98. private static boolean considerOnlyReliableSentences;
  99. /**
  100. * Main method to be run from the directory where the data is.
  101. * Expects already computed unit features in directory unitfeatures
  102. *
  103. * @param args the command line args (see printUsage for details)
  104. */
  105. public static void main(String[] args)throws Exception{
  106. main2(args);
  107. }
  108. /**
  109. * Main method to be run from the directory where the data is.
  110. * Expects already computed unit features in directory unitfeatures.
  111. *
  112. * @param args the command line args (see printUsage for details)
  113. *
  114. * @return the array of feature vectors used in the current pass
  115. */
  116. public static void main2(String[] args)
  117. throws Exception
  118. {
  119. /* Sort out the filenames and dirs for the logfiles */
  120. System.out.println("Starting Database Selection...");
  121. long time = System.currentTimeMillis();
  122. PrintWriter logOut;
  123. String dateString = "", dateDir = "";
  124. DateFormat fullDate = new SimpleDateFormat("dd_MM_yyyy_HH_mm_ss");
  125. DateFormat day = new SimpleDateFormat("dd_MM_yyyy");
  126. Date date = new Date();
  127. dateString = fullDate.format(date);
  128. dateDir = day.format(date);
  129. System.out.println("Reading arguments ...");
  130. StringBuffer logBuf = new StringBuffer();
  131. if (!readArgs(args,logBuf)){
  132. throw new Exception("Something wrong with the arguments.");
  133. }
  134. //make sure the stop criterion is allright
  135. SelectionFunction selFunc = new SelectionFunction();
  136. if (!selFunc.stopIsOkay(stopCriterion)){
  137. System.out.println("Stop criterion format is wrong: " + stopCriterion);
  138. printUsage();
  139. throw new Exception("Stop criterion format is wrong: " + stopCriterion);
  140. }
  141. //make various dirs
  142. File selectionDir = new File(selectionDirName);
  143. if (!selectionDir.exists())
  144. selectionDir.mkdir();
  145. File dateDirFile = new File(selectionDirName+dateDir);
  146. if (!dateDirFile.exists())
  147. dateDirFile.mkdir();
  148. //open log file
  149. String filename = selectionDirName + dateDir + "/selectionLog_" + dateString + ".txt";
  150. try{
  151. logOut = new PrintWriter(new BufferedWriter(new FileWriter(new File(filename))),true);
  152. } catch (Exception e){
  153. e.printStackTrace();
  154. throw new Exception("Error opening logfile");
  155. }
  156. //print date and arguments to log file
  157. logOut.println("Date: "+dateString);
  158. logOut.println(logBuf.toString());
  159. wikiToDB = new DBHandler(locale);
  160. // Check if name of selectedSentencesTable has to be changed
  161. if(selectedSentencesTableName != null)
  162. wikiToDB.setSelectedSentencesTableName(selectedSentencesTableName);
  163. else
  164. System.out.println("Current selected sentences table name = " + selectedSentencesTableName);
  165. // If connection succeed
  166. if( wikiToDB.createDBConnection(mysqlHost,mysqlDB,mysqlUser,mysqlPasswd) ) {
  167. /* Read in the feature definition */
  168. System.out.println("\nLoading feature definition...");
  169. try {
  170. BufferedReader uttFeats = new BufferedReader(new InputStreamReader(
  171. new FileInputStream(new File( featDefFileName )), "UTF-8"));
  172. featDef = new FeatureDefinition(uttFeats, false);
  173. uttFeats.close();
  174. System.out.println("TARGETFEATURES:" + featDef.getNumberOfFeatures() + " = " + featDef.getFeatureNames());
  175. } catch (Exception e){
  176. e.printStackTrace();
  177. throw new Exception("Error opening featureDefinition file");
  178. }
  179. System.out.println("Getting a list of ids for all the sentences in the DB...");
  180. System.out.println("(if the number of sentences is large, this can take a while)");
  181. System.out.println();
  182. String condition = null;
  183. if (considerOnlyReliableSentences) {
  184. condition = "reliable=true";
  185. }
  186. CoverageFeatureProvider cfp;
  187. if (holdVectorsInMemory) {
  188. /* Load the feature vectors from the database */
  189. System.out.println("Will also load feature vectors into memory (increase memory if this fails)");
  190. Pair<int[], byte[][]> pair = wikiToDB.getIdsAndFeatureVectors("dbselection", condition);
  191. int[] sentenceIDs = pair.getFirst();
  192. byte[][] vectorArray = pair.getSecond();
  193. cfp = new InMemoryCFProvider(vectorArray, sentenceIDs);
  194. } else {
  195. cfp = new DatabaseCFProvider(wikiToDB, condition);
  196. }
  197. /* Initialise the coverage definition */
  198. System.out.println("\nInitiating coverage...");
  199. CoverageDefinition covDef = new CoverageDefinition(featDef, cfp, covDefConfigFileName);
  200. // If the selectedSentencesTable is new, (does not exist) then a new table
  201. // will be created, the selected field in the dbselection table will be initialised to selected=false.
  202. // The sentences already marke in this db as unwanted=true will be kept.
  203. wikiToDB.createSelectedSentencesTable(stopCriterion, featDefFileName, covDefConfigFileName);
  204. // With the information provided by the user
  205. wikiToDB.setTableDescription(wikiToDB.getSelectedSentencesTableName(), tableDescription,
  206. stopCriterion,featDefFileName, covDefConfigFileName);
  207. long startTime = System.currentTimeMillis();
  208. File covSetFile = new File(initFileName);
  209. boolean readCovFromFile = true;
  210. if (!covSetFile.exists()){
  211. //coverage has to be initialised
  212. readCovFromFile = false;
  213. covDef.initialiseCoverage();
  214. System.out.println("\nWriting coverage to file "+initFileName);
  215. covDef.writeCoverageBin(initFileName);
  216. } else {
  217. condition = null;
  218. if (considerOnlyReliableSentences) {
  219. condition = "reliable=true";
  220. }
  221. int[] idSentenceList = wikiToDB.getIdListOfType("dbselection", condition);
  222. covDef.readCoverageBin(initFileName,idSentenceList);
  223. }
  224. /* add already selected sentences to cover */
  225. System.out.println("\nAdd to cover already selected sentences marked as unwanted=false.");
  226. selectedIdSents = new LinkedHashSet<Integer>();
  227. addSelectedSents(selectedSentencesTableName, covDef);
  228. /* remove unwanted sentences from basename list */
  229. System.out.println("\nRemoving selected sentences marked as unwanted=true.");
  230. unwantedIdSents = new LinkedHashSet<Integer>();
  231. removeUnwantedSentences(selectedSentencesTableName);
  232. long startDuration = System.currentTimeMillis() -startTime;
  233. if (verbose)
  234. System.out.println("Startup took "+startDuration+" milliseconds");
  235. logOut.println("Startup took "+startDuration+" milliseconds");
  236. /* print text corpus statistics */
  237. if (!readCovFromFile){
  238. //only print if we did not read from file
  239. filename = selectionDirName+"textcorpus_distribution.txt";
  240. System.out.println("Printing text corpus statistics to "+filename+"...");
  241. PrintWriter out = null;
  242. try{
  243. out = new PrintWriter(new FileWriter(new File(filename)),true);
  244. covDef.printTextCorpusStatistics(out);
  245. } catch (Exception e){
  246. e.printStackTrace();
  247. throw new Exception("Error printing statistics");
  248. } finally {
  249. out.close();
  250. }
  251. }
  252. //print settings of the coverage definition to log file
  253. covDef.printSettings(logOut);
  254. /* Start the algorithm */
  255. System.out.println("\nSelecting sentences...");
  256. // If it is not already running (could happen when SynthesisScriptGUI is used)
  257. // Start builtin MARY TTS in order to get and save the transcription
  258. // of the selected sentences (selected_text_transcription.log)
  259. if (Mary.currentState() == Mary.STATE_OFF)
  260. {
  261. System.out.print("Starting builtin MARY TTS...");
  262. Mary.startup();
  263. System.out.println(" MARY TTS started.");
  264. }
  265. //selFunc.select(selectedSents,covDef,logOut,basenameList,holdVectorsInMemory,verbose);
  266. selFunc.select(selectedIdSents,unwantedIdSents,covDef,logOut,cfp,verbose,wikiToDB);
  267. /* Store list of selected files */
  268. filename = selectionDirName+dateDir + "/selectionResult_" + dateString + ".txt";
  269. //storeResult(filename,selectedSents);
  270. storeResult(filename,selectedIdSents);
  271. /* print statistics */
  272. System.out.println("Printing selection distribution and table...");
  273. String disFile = selectionDirName+dateDir + "/selectionDistribution_" + dateString + ".txt";
  274. String devFile = selectionDirName+dateDir + "/selectionDevelopment_" + dateString + ".txt";
  275. try{
  276. covDef.printSelectionDistribution(disFile,devFile,logCovDevelopment);
  277. } catch (Exception e){
  278. e.printStackTrace();
  279. throw new Exception("Error printing statistics");
  280. }
  281. if (overallLogFile != null){
  282. //append results to end of overall log file
  283. PrintWriter overallLogOut = new PrintWriter(new OutputStreamWriter(new FileOutputStream(
  284. new File(overallLogFile),true),"UTF-8"),true);
  285. overallLogOut.println("*******************************\n" + "Results for "+dateString+":");
  286. //overallLogOut.println("number of basenames "+basenameList.length);
  287. overallLogOut.println("number of basenames "+cfp.getNumSentences());
  288. overallLogOut.println("Stop criterion "+stopCriterion);
  289. covDef.printResultToLog(overallLogOut);
  290. overallLogOut.close();
  291. }
  292. //print timing information
  293. long elapsedTime = System.currentTimeMillis() - time;
  294. double minutes = (double)elapsedTime/(double)1000/(double)60;
  295. System.out.println("Selection took "+minutes+" minutes("+elapsedTime+" milliseconds)");
  296. logOut.println("Selection took "+minutes+" minutes ("+elapsedTime+" milliseconds)");
  297. logOut.flush();
  298. logOut.close();
  299. wikiToDB.closeDBConnection();
  300. System.out.println("All done!");
  301. } else { // connection did not succeed
  302. System.out.println("\nERROR: Problems with connection to the DB, please check the mysql parameters.");
  303. throw new Exception("ERROR: Problems with connection to the DB, please check the mysql parameters.");
  304. }
  305. }
  306. /**
  307. * Read and check the command line arguments
  308. *
  309. * @param args the arguments
  310. * @param log a StringBufffer for logging
  311. * @return true if args can be parsed and all essential args are there,
  312. * false otherwise
  313. */
  314. private static boolean readArgs(String[] args,StringBuffer log) throws Exception{
  315. //initialise default values
  316. String currentDir = System.getProperty("user.dir");
  317. String maryBaseDir = System.getenv("MARY_BASE");
  318. System.out.println("Current directory: " + currentDir + " MARY_BASE=" + maryBaseDir);
  319. locale = null;
  320. selectionDirName = null;
  321. initFileName = null;
  322. covDefConfigFileName = null;
  323. featDefFileName = null;
  324. overallLogFile = null;
  325. holdVectorsInMemory = true;
  326. verbose = false;
  327. logCovDevelopment = false;
  328. mysqlHost = null;
  329. mysqlDB = null;
  330. mysqlUser = null;
  331. mysqlPasswd = null;
  332. selectedSentencesTableName = null;
  333. tableDescription = "";
  334. considerOnlyReliableSentences = true;
  335. stopCriterion = null;
  336. // Default values for
  337. holdVectorsInMemory = true;
  338. verbose = false;
  339. logCovDevelopment = false;
  340. int i=0;
  341. int numEssentialArgs = 0;
  342. //loop over args
  343. while (args.length > i){
  344. if (args[i].equals("-locale")){
  345. if (args.length > i+1){
  346. i++;
  347. locale = args[i];
  348. log.append("locale : "+args[i]+"\n");
  349. System.out.println(" locale : "+args[i]);
  350. numEssentialArgs++;
  351. } else {
  352. System.out.println("No locale.");
  353. printUsage();
  354. return false;
  355. }
  356. i++;
  357. continue;
  358. }
  359. if (args[i].equals("-mysqlHost")){
  360. if (args.length > i+1){
  361. i++;
  362. mysqlHost = args[i];
  363. log.append("mysqlHost : "+args[i]+"\n");
  364. System.out.println(" mysqlHost : "+args[i]);
  365. numEssentialArgs++;
  366. } else {
  367. System.out.println("No mysqlHost.");
  368. printUsage();
  369. return false;
  370. }
  371. i++;
  372. continue;
  373. }
  374. if (args[i].equals("-mysqlDB")){
  375. if (args.length > i+1){
  376. i++;
  377. mysqlDB = args[i];
  378. log.append("mysqlDB : "+args[i]+"\n");
  379. System.out.println(" mysqlDB : "+args[i]);
  380. numEssentialArgs++;
  381. } else {
  382. System.out.println("No mysqlDB.");
  383. printUsage();
  384. return false;
  385. }
  386. i++;
  387. continue;
  388. }
  389. if (args[i].equals("-mysqlUser")){
  390. if (args.length > i+1){
  391. i++;
  392. mysqlUser = args[i];
  393. log.append("mysqlUser : "+args[i]+"\n");
  394. System.out.println(" mysqlUser : "+args[i]);
  395. numEssentialArgs++;
  396. } else {
  397. System.out.println("No mysqlUser.");
  398. printUsage();
  399. return false;
  400. }
  401. i++;
  402. continue;
  403. }
  404. if (args[i].equals("-mysqlPasswd")){
  405. if (args.length > i+1){
  406. i++;
  407. mysqlPasswd = args[i];
  408. log.append("mysqlPasswd : "+args[i]+"\n");
  409. System.out.println(" mysqlPasswd : "+args[i]);
  410. numEssentialArgs++;
  411. } else {
  412. System.out.println("No mysqlPasswd.");
  413. printUsage();
  414. return false;
  415. }
  416. i++;
  417. continue;
  418. }
  419. if (args[i].equals("-featDef")){
  420. if (args.length > i+1){
  421. i++;
  422. featDefFileName = args[i];
  423. log.append("FeatDefFileName : "+args[i]+"\n");
  424. System.out.println(" FeatDefFileName : "+args[i]);
  425. } else {
  426. System.out.println("No featDef file");
  427. printUsage();
  428. return false;
  429. }
  430. i++;
  431. continue;
  432. }
  433. if (args[i].equals("-initFile")){
  434. if (args.length > i+1){
  435. i++;
  436. initFileName = args[i];
  437. log.append("initFile : "+args[i]+"\n");
  438. System.out.println(" initFile : "+args[i]);
  439. } else {
  440. System.out.println("No initFile");
  441. printUsage();
  442. return false;
  443. }
  444. i++;
  445. continue;
  446. }
  447. if (args[i].equals("-tableName")){
  448. if (args.length > i+1){
  449. i++;
  450. selectedSentencesTableName = args[i];
  451. log.append("selectedSentencesTable name : "+args[i]+"\n");
  452. System.out.println(" selectedSentencesTable name: "+args[i]);
  453. numEssentialArgs++;
  454. } else {
  455. System.out.println("No selectedSentencesTable name");
  456. printUsage();
  457. return false;
  458. }
  459. i++;
  460. continue;
  461. }
  462. if (args[i].equals("-tableDescription")){
  463. if (args.length > i+1){
  464. i++;
  465. tableDescription = args[i];
  466. log.append("tableDescription : "+args[i]+"\n");
  467. System.out.println(" tableDescription: "+args[i]);
  468. } else {
  469. System.out.println("No tableDescription");
  470. printUsage();
  471. return false;
  472. }
  473. i++;
  474. continue;
  475. }
  476. if (args[i].equals("-vectorsOnDisk")){
  477. holdVectorsInMemory = false;
  478. log.append("vectorsOnDisk");
  479. System.out.println(" vectorsOnDisk");
  480. i++;
  481. continue;
  482. }
  483. if (args[i].equals("-verbose")){
  484. verbose = true;
  485. log.append("verbose");
  486. System.out.println(" verbose");
  487. i++;
  488. continue;
  489. }
  490. if (args[i].equals("-logCoverageDevelopment")){
  491. logCovDevelopment = true;
  492. log.append("logCoverageDevelopment");
  493. System.out.println(" logCoverageDevelopment");
  494. i++;
  495. continue;
  496. }
  497. if (args[i].equals("-selectionDir")){
  498. if (args.length > i+1){
  499. i++;
  500. selectionDirName = args[i];
  501. //make sure we have a slash at the end
  502. char lastChar =
  503. selectionDirName.charAt(selectionDirName.length()-1);
  504. if (Character.isLetterOrDigit(lastChar)){
  505. selectionDirName = selectionDirName+"/";
  506. }
  507. log.append("selectionDir : "+args[i]+"\n");
  508. System.out.println(" selectionDir : "+args[i]);
  509. } else {
  510. System.out.println("No selectionDir");
  511. printUsage();
  512. return false;
  513. }
  514. i++;
  515. continue;
  516. }
  517. if (args[i].equals("-coverageConfig")){
  518. if (args.length > i+1){
  519. i++;
  520. covDefConfigFileName = args[i];
  521. log.append("coverageConfig : "+args[i]+"\n");
  522. System.out.println(" coverageConfig : "+args[i]);
  523. } else {
  524. System.out.println("No coverageConfig");
  525. printUsage();
  526. return false;
  527. }
  528. i++;
  529. continue;
  530. }
  531. if (args[i].equals("-stop")){
  532. StringBuilder tmp = new StringBuilder();
  533. i++;
  534. while (args.length > i){
  535. if (args[i].startsWith("-")) break;
  536. tmp.append(args[i]+" ");
  537. i++;
  538. }
  539. stopCriterion = tmp.toString();
  540. log.append("stop criterion : "+stopCriterion+"\n");
  541. System.out.println(" stop criterion : "+stopCriterion);
  542. continue;
  543. }
  544. if (args[i].equals("-overallLog")){
  545. if (args.length > i+1){
  546. i++;
  547. overallLogFile = args[i];
  548. log.append("overallLogFile : "+args[i]+"\n");
  549. System.out.println(" overallLogFile : "+args[i]);
  550. } else {
  551. System.out.println("No overall log file");
  552. printUsage();
  553. return false;
  554. }
  555. i++;
  556. continue;
  557. }
  558. /* It is currently not possible to use unreliable sentences.
  559. * The place where this can be influenced is the FeatureMaker,
  560. * in its setting ""
  561. if (args[i].equals("-reliableOnly")) { // optionally, request that only "reliable" sentences be used in selection
  562. considerOnlyReliableSentences = true;
  563. log.append("using only reliable sentences\n");
  564. System.out.println("using only reliable sentences");
  565. i++;
  566. continue;
  567. }
  568. */
  569. i++;
  570. }
  571. System.out.println();
  572. if (numEssentialArgs<6){
  573. //not all essential arguments were given
  574. System.out.println("You must at least specify locale, mysql (host,user,paswd,DB), selectedSentencesTableName");
  575. printUsage();
  576. return false;
  577. }
  578. if(selectedSentencesTableName==null){
  579. System.out.println("Please provide a name for the selectedSentencesTable.");
  580. printUsage();
  581. return false;
  582. }
  583. if( stopCriterion == null){
  584. stopCriterion = "numSentences 90 simpleDiphones simpleProsody";
  585. }
  586. if (selectionDirName == null){
  587. selectionDirName = currentDir + "/selection/";
  588. }
  589. if (initFileName == null){
  590. initFileName = currentDir + "/init.bin";
  591. }
  592. if (overallLogFile == null ){
  593. overallLogFile = currentDir + "/overallLog.txt";
  594. }
  595. if(featDefFileName == null){
  596. // check first if there exists one in the current directory
  597. // if not ask the user to provide one, it should have been automatically generated by the FeatureMaker in previous step
  598. // See: http://mary.opendfki.de/wiki/NewLanguageSupport step 5
  599. System.out.println("Checking if there is [locale]_featureDefinition.txt in the current directory");
  600. File feaDef = new File(currentDir + "/" + locale + "_featureDefinition.txt");
  601. if( feaDef.exists() ){
  602. System.out.println("Using " + locale + "_featureDefinition.txt in current directory." );
  603. featDefFileName = currentDir + "/" + locale + "_featureDefinition.txt";
  604. }
  605. else
  606. System.out.println("Please provide a [locale]_featureDefinition.txt, it should have been generated by the FeatureMaker. \n" +
  607. " See: http://mary.opendfki.de/wiki/NewLanguageSupport step 5.");
  608. }
  609. if (covDefConfigFileName == null){
  610. // check if there is already a covDef.config file in the current directory
  611. // if not then copy the default covDef.config from jar archive resource (marytts/tools/dbselection/covDef.config)
  612. System.out.println("\nChecking if there is already a covDef.config in the current directory");
  613. File covDef = new File(currentDir + "/covDef.config");
  614. if( covDef.exists() )
  615. System.out.println("Using covDef.config in current directory." );
  616. else
  617. {
  618. System.out.println("Copying default covDef.config file from archive" );
  619. FileUtils.copyInputStreamToFile(DatabaseSelector.class.getResourceAsStream("covDef.config"), covDef);
  620. }
  621. covDefConfigFileName = currentDir + "/covDef.config";
  622. System.out.println("covDefConfigFileName = " + covDefConfigFileName);
  623. }
  624. return true;
  625. }
  626. /**
  627. * Print usage of main method
  628. * to standard out
  629. */
  630. private static void printUsage(){
  631. System.out.println("\nUsage: " +
  632. "Usage: java DatabaseSelector -locale language -mysqlHost host -mysqlUser user -mysqlPasswd passwd -mysqlDB wikiDB\n"
  633. +"-tableName selectedSentencesTableName \n"
  634. +" [-stop stopCriterion]\n"
  635. +" [-featDef file -coverageConfig file]\n"
  636. +" [-initFile file -selectedSentences file -unwantedSentences file ]\n"
  637. +" [-tableDescription a brief description of the table ]\n"
  638. +" [-vectorsOnDisk -overallLog file -selectionDir dir -logCoverageDevelopment -verbose]\n"
  639. +" Arguments:\n"
  640. +" -tableName selectedSentencesTableName : The name of a new selection set, change this name when\n"
  641. +" generating several selection sets. FINAL name will be: \"locale_name_selectedSenteces\". \n"
  642. +" where name is the name provided for the selected sentences table.\n"
  643. +" -tableDescription : short description of the selected sentences table.\n"
  644. +" Default: empty\n"
  645. +" -featDef file : The feature definition for the features\n"
  646. +" Default: [locale]_featureDefinition.txt for example for US English: en_US_featureDefinition.txt\n"
  647. +" this file is automatically created in previous steps by the FeatureMaker.\n"
  648. +" -stop stopCriterion : which stop criterion to use. There are five stop criteria. \n"
  649. +" They can be used individually or can be combined:\n"
  650. +" - numSentences n : selection stops after n sentences\n"
  651. +" - simpleDiphones : selection stops when simple diphone coverage has reached maximum\n"
  652. +" - simpleProsody : selection stops when simple prosody coverage has reached maximum\n"
  653. +" Default: \"numSentences 90 simpleDiphones simpleProsody\"\n"
  654. +" -coverageConfig file : The config file for the coverage definition. \n"
  655. +" Default: there is a default coverage config file in MARY_BASE/resources/marytts/tools/dbselection/covDef.config\n"
  656. +" this file will be copied to the current directory if no file is provided.\n"
  657. +" -initFile file : The file containing the coverage data needed to initialise the algorithm.\n"
  658. +" Default: /current_dir/init.bin\n"
  659. +" -overallLog file : Log file for all runs of the program: date, settings and results of the current\n"
  660. +" run are appended to the end of the file. This file is needed if you want to analyse your results \n"
  661. +" with the ResultAnalyser later.\n"
  662. +" -selectionDir dir : the directory where all selection data is stored.\n"
  663. +" Default: /current_dir/selection\n"
  664. +" -vectorsOnDisk: if this option is given, the feature vectors are not loaded into memory during\n"
  665. +" the run of the program. This notably slows down the run of the program!\n"
  666. +" Default: no vectorsOnDisk\n"
  667. +" -logCoverageDevelopment : If this option is given, the coverage development over time is stored.\n"
  668. +" Default: no logCoverageDevelopment\n"
  669. +" -verbose : If this option is given, there will be more output on the command line during the run of the program.\n"
  670. +" Default: no verbose\n");
  671. }
  672. /***
  673. * Manual selection of wanted/unwanted selected sentences
  674. *
  675. */
  676. private static void checkSelectedSentences(){
  677. InputStreamReader isr = new InputStreamReader(System.in);
  678. BufferedReader br = new BufferedReader(isr);
  679. try{
  680. System.out.println("\nChecking selected sentences whether they are wanted or not.");
  681. System.out.println(" selected sentences will be saved in ./selected.log");
  682. PrintWriter selectedLog = new PrintWriter(new FileWriter(new File("./selected.log")));
  683. System.out.println(" selected sentences and transcriptions will be saved in ./selected_text_transcription.log");
  684. PrintWriter selected_tra_Log = new PrintWriter(new FileWriter(new File("./selected_text_transcription.log")));
  685. System.out.println(" unwanted sentences will be saved in ./unwanted.log");
  686. PrintWriter unwantedLog = new PrintWriter(new FileWriter(new File("./unwanted.log")));
  687. int sel[] = wikiToDB.getIdListOfType("dbselection", "selected=true and unwanted=false");
  688. if( sel != null){
  689. // checking selected sentences
  690. System.out.println(" Select \"y\" for marking sentence as \"wanted\" otherwise \"n\" . Press any other key to finish: \n");
  691. String str;
  692. for(int i=0; i<sel.length; i++){
  693. str = wikiToDB.getSelectedSentence(wikiToDB.getSelectedSentencesTableName(), sel[i]);
  694. System.out.print("id=" + sel[i] + ": "+ str + "\n Wanted?(y/n):");
  695. String s = br.readLine();
  696. if( s.contentEquals("n")){
  697. wikiToDB.setSentenceRecord(sel[i], "unwanted", true);
  698. unwantedLog.println(sel[i] + " " + str);
  699. } else if( s.contentEquals("y")){
  700. selectedLog.println(sel[i] + " " + str);
  701. selected_tra_Log.println(sel[i] + " " + str);
  702. selected_tra_Log.println(sel[i] + " <" + SelectionFunction.transcribe(str,locale) + ">");
  703. } else{
  704. unwantedLog.close();
  705. selectedLog.close();
  706. selected_tra_Log.close();
  707. break;
  708. }
  709. }
  710. } else
  711. System.out.println("There is no selected sentences in the DB.");
  712. } catch(Exception e){
  713. System.out.println(e);
  714. }
  715. }
  716. /**
  717. * Add a list of sentences to the cover
  718. * Here the already selected sentences are added to the cover and the indexes removed
  719. * (or set to -1) in the idSentenceList
  720. * @param covDef the cover
  721. * @throws Exception
  722. */
  723. private static void addSelectedSents(String tableName, CoverageDefinition covDef)throws Exception{
  724. if (verbose)
  725. System.out.println("\nAdding previously selected sentences ...");
  726. int idSentenceListSelected[] = wikiToDB.getIdListOfSelectedSentences(
  727. wikiToDB.getSelectedSentencesTableName(), "unwanted=false");
  728. int id;
  729. byte[] vectorBuf;
  730. if( idSentenceListSelected != null ){
  731. for(int i=0; i<idSentenceListSelected.length; i++){
  732. id = idSentenceListSelected[i];
  733. vectorBuf = wikiToDB.getFeatures(id);
  734. //fill the cover set with the sentence
  735. covDef.updateCover(vectorBuf);
  736. //add the filename to the sentence list
  737. selectedIdSents.add((Integer)id);
  738. }
  739. /*
  740. int numSelectedSents = selectedIdSents.size();
  741. int numRemovedSents = 0;
  742. //loop over basename array
  743. // No need to mark id negative
  744. for (int i=0;i<idSentenceList.length;i++){
  745. if (selectedIdSents.contains(idSentenceList[i])){
  746. //remove the sentence also from the idSentenceList
  747. if (verbose)
  748. System.out.println(" Removing from idSentenceList id=" + idSentenceList[i]);
  749. idSentenceList[i] = -1;
  750. numRemovedSents++;
  751. }
  752. if (numSelectedSents == numRemovedSents) break;
  753. }
  754. */
  755. System.out.println("Added to cover " + idSentenceListSelected.length + " selected sentences");
  756. } else
  757. System.out.println("There is no already selected sentences to add to the list.");
  758. }
  759. /**
  760. * Remove unwanted sentences from the basename list
  761. *
  762. * @throws Exception
  763. */
  764. private static void removeUnwantedSentences(String tableName) throws Exception{
  765. if (verbose)
  766. System.out.println("\nRemoving unwanted sentences ...");
  767. int idSentenceListUnwanted[] = wikiToDB.getIdListOfSelectedSentences(
  768. wikiToDB.getSelectedSentencesTableName(), "unwanted=true");
  769. int id;
  770. if( idSentenceListUnwanted != null ){
  771. for(int i=0; i<idSentenceListUnwanted.length; i++){
  772. id = idSentenceListUnwanted[i];
  773. // mark sentence as unwanted in the locale_dbselection table
  774. // this is already done when selecting unwanted with the SynthesisScriptGUI
  775. //wikiToDB.setSentenceRecord(id, "unwanted", true);
  776. unwantedIdSents.add((Integer)id);
  777. }
  778. /*
  779. // remove sentences from basename list
  780. int numSelectedSents = unwantedIdSents.size();
  781. int numRemovedSents = 0;
  782. // loop over basename array
  783. for (int i=0;i<idSentenceList.length;i++){
  784. if (unwantedIdSents.contains(idSentenceList[i])){
  785. //remove the sentence also from the idSentenceList
  786. if (verbose)
  787. System.out.println(" Removing (unwanted)from idSentenceList id=" + idSentenceList[i]);
  788. idSentenceList[i] = -1;
  789. numRemovedSents++;
  790. }
  791. if (numSelectedSents == numRemovedSents) break;
  792. }
  793. */
  794. System.out.println("Removed " + idSentenceListUnwanted.length + " unwanted sentences.");
  795. } else
  796. System.out.println("There is no unwanted sentences to remove.");
  797. }
  798. /**
  799. * Print the list of selected files
  800. *
  801. * @param filename the file to print to
  802. * @param selected the list of files
  803. */
  804. private static void storeResult(String filename, Set<Integer> selected){
  805. PrintWriter out;
  806. try{
  807. out = new PrintWriter(new FileWriter(new File(filename)));
  808. } catch (Exception e){
  809. e.printStackTrace();
  810. throw new Error("Error storing result");
  811. }
  812. for (int sel : selected) {
  813. out.println(sel);
  814. }
  815. out.flush();
  816. out.close();
  817. }
  818. }