PageRenderTime 50ms CodeModel.GetById 21ms RepoModel.GetById 0ms app.codeStats 0ms

/tags/sphinxbase-0.7/src/sphinx_adtools/cont_fileseg.c

#
C | 542 lines | 374 code | 48 blank | 120 comment | 146 complexity | 187746e1640655bf132059d4ba860621 MD5 | raw file
Possible License(s): Apache-2.0, CC-BY-SA-3.0, BSD-3-Clause, LGPL-2.0, BSD-3-Clause-No-Nuclear-License-2014
  1. /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
  2. /* ====================================================================
  3. * Copyright (c) 1999-2001 Carnegie Mellon University. All rights
  4. * reserved.
  5. *
  6. * Redistribution and use in source and binary forms, with or without
  7. * modification, are permitted provided that the following conditions
  8. * are met:
  9. *
  10. * 1. Redistributions of source code must retain the above copyright
  11. * notice, this list of conditions and the following disclaimer.
  12. *
  13. * 2. Redistributions in binary form must reproduce the above copyright
  14. * notice, this list of conditions and the following disclaimer in
  15. * the documentation and/or other materials provided with the
  16. * distribution.
  17. *
  18. * This work was supported in part by funding from the Defense Advanced
  19. * Research Projects Agency and the National Science Foundation of the
  20. * United States of America, and the CMU Sphinx Speech Consortium.
  21. *
  22. * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
  23. * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
  24. * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  25. * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
  26. * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  27. * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  28. * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  29. * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  30. * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  31. * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  32. * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  33. *
  34. * ====================================================================
  35. *
  36. */
  37. /*
  38. * cont_fileseg.c -- Read input file, filter silence regions, and segment into utterances.
  39. *
  40. * HISTORY
  41. *
  42. * $Log: cont_fileseg.c,v $
  43. * Revision 1.1.1.1 2006/05/23 18:45:02 dhuggins
  44. * re-importation
  45. *
  46. * Revision 1.13 2005/06/30 00:28:46 rkm
  47. * Kept within-utterance silences in rawmode
  48. *
  49. *
  50. * 28-Jun-2005 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
  51. * Modified to use new state variables in cont_ad_t.
  52. *
  53. * Revision 1.12 2005/05/31 15:54:38 rkm
  54. * *** empty log message ***
  55. *
  56. * Revision 1.11 2005/05/24 20:56:58 rkm
  57. * Added min/max-noise parameters to cont_fileseg
  58. *
  59. * Revision 1.10 2005/05/13 23:28:43 egouvea
  60. * Changed null device to system dependent one: NUL for windows, /dev/null for everything else
  61. *
  62. * $Log: cont_fileseg.c,v $
  63. * Revision 1.1.1.1 2006/05/23 18:45:02 dhuggins
  64. * re-importation
  65. *
  66. * Revision 1.13 2005/06/30 00:28:46 rkm
  67. * Kept within-utterance silences in rawmode
  68. *
  69. * Revision 1.12 2005/05/31 15:54:38 rkm
  70. * *** empty log message ***
  71. *
  72. * Revision 1.11 2005/05/24 20:56:58 rkm
  73. * Added min/max-noise parameters to cont_fileseg
  74. *
  75. * Revision 1.9 2005/02/13 01:29:48 rkm
  76. * Fixed cont_ad_read to never cross sil/speech boundary, and rawmode
  77. *
  78. * Revision 1.8 2005/02/01 22:21:13 rkm
  79. * Added raw data logging, and raw data pass-through mode to cont_ad
  80. *
  81. * Revision 1.7 2004/07/16 00:57:11 egouvea
  82. * Added Ravi's implementation of FSG support.
  83. *
  84. * Revision 1.3 2004/06/25 14:58:05 rkm
  85. * *** empty log message ***
  86. *
  87. * Revision 1.2 2004/06/23 20:32:08 rkm
  88. * Exposed several cont_ad config parameters
  89. *
  90. *
  91. * 27-Jun-96 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
  92. * Created.
  93. */
  94. #include <stdio.h>
  95. #include <stdlib.h>
  96. #include <string.h>
  97. #include <assert.h>
  98. #include <math.h>
  99. #include <sphinxbase/prim_type.h>
  100. #include <sphinxbase/ad.h>
  101. #include <sphinxbase/cont_ad.h>
  102. #include <sphinxbase/err.h>
  103. static FILE *infp; /* File being segmented */
  104. static int32 swap;
  105. /* Max size read by file_ad_read function on each invocation, for debugging */
  106. static int32 max_ad_read_size;
  107. #if defined(WIN32) && !defined(GNUWINCE)
  108. #define NULL_DEVICE "NUL"
  109. #else
  110. #define NULL_DEVICE "/dev/null"
  111. #endif
  112. /*
  113. * Need to provide cont_ad_init with a read function to read the input file.
  114. * This is it. The ad_rec_t *r argument is ignored since there is no A/D
  115. * device involved.
  116. */
  117. static int32
  118. file_ad_read(ad_rec_t * r, int16 * buf, int32 max)
  119. {
  120. int32 i, k;
  121. if (max > max_ad_read_size)
  122. max = max_ad_read_size;
  123. k = fread(buf, sizeof(int16), max, infp);
  124. if (swap) {
  125. for (i = 0; i < k; i++) {
  126. buf[i] = ((buf[i] >> 8) & 0x00ff) | ((buf[i] << 8) & 0xff00);
  127. }
  128. }
  129. return ((k > 0) ? k : -1);
  130. }
  131. static void
  132. usagemsg(char *pgm)
  133. {
  134. E_INFO("Usage: %s \\\n", pgm);
  135. E_INFOCONT("\t[-? | -h] \\\n");
  136. E_INFOCONT("\t[-d | -debug] \\\n");
  137. E_INFOCONT("\t[-sps <sampling-rate> (16000)] \\\n");
  138. E_INFOCONT("\t[-b | -byteswap] \\\n");
  139. E_INFOCONT
  140. ("\t[{-s | -silsep} <length-silence-separator(sec) (0.5)]> \\\n");
  141. E_INFOCONT("\t[-w | -writeseg] \\\n");
  142. E_INFOCONT("\t[-min-noise <min-noise>] \\\n");
  143. E_INFOCONT("\t[-max-noise <max-noise>] \\\n");
  144. E_INFOCONT("\t[-delta-sil <delta-sil>] \\\n");
  145. E_INFOCONT("\t[-delta-speech <delta-speech>] \\\n");
  146. E_INFOCONT("\t[-sil-onset <sil-onset>] \\\n");
  147. E_INFOCONT("\t[-speech-onset <speech-onset>] \\\n");
  148. E_INFOCONT("\t[-adapt-rate <adapt-rate>] \\\n");
  149. E_INFOCONT("\t[-max-adreadsize <ad_read_blksize>] \\\n");
  150. E_INFOCONT("\t[-c <copy-input-file>] \\\n");
  151. E_INFOCONT("\t[-r | -rawmode] \\\n");
  152. E_INFOCONT("\t-i <input-file>\n");
  153. exit(0);
  154. }
  155. /*
  156. * Read specified input file, segment it into utterances wherever a silence segment of
  157. * a given minimum duration is encountered. Filter out long silences.
  158. * Utterances are written to files named 00000000.raw, 00000001.raw, 00000002.raw, etc.
  159. */
  160. int
  161. main(int32 argc, char **argv)
  162. {
  163. cont_ad_t *cont;
  164. int32 uttid, uttlen, starttime, siltime, sps, debug, writeseg, rawmode;
  165. int16 buf[4096];
  166. char *infile, *copyfile, segfile[1024];
  167. FILE *fp;
  168. float endsil;
  169. ad_rec_t ad;
  170. int32 i, k;
  171. int32 winsize, leader, trailer;
  172. int32 orig_min_noise, orig_max_noise;
  173. int32 orig_delta_sil, orig_delta_speech;
  174. int32 orig_speech_onset, orig_sil_onset;
  175. int32 min_noise, max_noise;
  176. int32 delta_sil, delta_speech;
  177. int32 sil_onset, speech_onset;
  178. float32 orig_adapt_rate;
  179. float32 adapt_rate;
  180. int32 total_speech_samples;
  181. float32 total_speech_sec;
  182. FILE *rawfp;
  183. /* Set argument defaults */
  184. cont = NULL;
  185. sps = 16000;
  186. swap = 0;
  187. endsil = 0.5;
  188. writeseg = 0;
  189. min_noise = max_noise = -1;
  190. delta_sil = delta_speech = -1;
  191. sil_onset = speech_onset = -1;
  192. adapt_rate = -1.0;
  193. max_ad_read_size = (int32) 0x7ffffff0;
  194. debug = 0;
  195. infile = NULL;
  196. copyfile = NULL;
  197. rawfp = NULL;
  198. rawmode = 0;
  199. /* Parse arguments */
  200. for (i = 1; i < argc; i++) {
  201. if ((strcmp(argv[i], "-help") == 0)
  202. || (strcmp(argv[i], "-h") == 0)
  203. || (strcmp(argv[i], "-?") == 0)) {
  204. usagemsg(argv[0]);
  205. }
  206. else if ((strcmp(argv[i], "-debug") == 0)
  207. || (strcmp(argv[i], "-d") == 0)) {
  208. debug = 1;
  209. }
  210. else if (strcmp(argv[i], "-sps") == 0) {
  211. i++;
  212. if ((i == argc)
  213. || (sscanf(argv[i], "%d", &sps) != 1)
  214. || (sps <= 0)) {
  215. E_ERROR("Invalid -sps argument\n");
  216. usagemsg(argv[0]);
  217. }
  218. }
  219. else if ((strcmp(argv[i], "-byteswap") == 0)
  220. || (strcmp(argv[i], "-b") == 0)) {
  221. swap = 1;
  222. }
  223. else if ((strcmp(argv[i], "-silsep") == 0)
  224. || (strcmp(argv[i], "-s") == 0)) {
  225. i++;
  226. if ((i == argc)
  227. || (sscanf(argv[i], "%f", &endsil) != 1)
  228. || (endsil <= 0.0)) {
  229. E_ERROR("Invalid -silsep argument\n");
  230. usagemsg(argv[0]);
  231. }
  232. }
  233. else if ((strcmp(argv[i], "-writeseg") == 0)
  234. || (strcmp(argv[i], "-w") == 0)) {
  235. writeseg = 1;
  236. }
  237. else if (strcmp(argv[i], "-min-noise") == 0) {
  238. i++;
  239. if ((i == argc) ||
  240. (sscanf(argv[i], "%d", &min_noise) != 1) ||
  241. (min_noise < 0)) {
  242. E_ERROR("Invalid -min-noise argument\n");
  243. usagemsg(argv[0]);
  244. }
  245. }
  246. else if (strcmp(argv[i], "-max-noise") == 0) {
  247. i++;
  248. if ((i == argc) ||
  249. (sscanf(argv[i], "%d", &max_noise) != 1) ||
  250. (max_noise < 0)) {
  251. E_ERROR("Invalid -max-noise argument\n");
  252. usagemsg(argv[0]);
  253. }
  254. }
  255. else if (strcmp(argv[i], "-delta-sil") == 0) {
  256. i++;
  257. if ((i == argc) ||
  258. (sscanf(argv[i], "%d", &delta_sil) != 1) ||
  259. (delta_sil < 0)) {
  260. E_ERROR("Invalid -delta-sil argument\n");
  261. usagemsg(argv[0]);
  262. }
  263. }
  264. else if (strcmp(argv[i], "-delta-speech") == 0) {
  265. i++;
  266. if ((i == argc) ||
  267. (sscanf(argv[i], "%d", &delta_speech) != 1) ||
  268. (delta_speech < 0)) {
  269. E_ERROR("Invalid -delta-speech argument\n");
  270. usagemsg(argv[0]);
  271. }
  272. }
  273. else if (strcmp(argv[i], "-sil-onset") == 0) {
  274. i++;
  275. if ((i == argc) ||
  276. (sscanf(argv[i], "%d", &sil_onset) != 1) ||
  277. (sil_onset < 1)) {
  278. E_ERROR("Invalid -sil-onset argument\n");
  279. usagemsg(argv[0]);
  280. }
  281. }
  282. else if (strcmp(argv[i], "-speech-onset") == 0) {
  283. i++;
  284. if ((i == argc) ||
  285. (sscanf(argv[i], "%d", &speech_onset) != 1) ||
  286. (speech_onset < 1)) {
  287. E_ERROR("Invalid -speech-onset argument\n");
  288. usagemsg(argv[0]);
  289. }
  290. }
  291. else if (strcmp(argv[i], "-adapt-rate") == 0) {
  292. i++;
  293. if ((i == argc) ||
  294. (sscanf(argv[i], "%f", &adapt_rate) != 1) ||
  295. (adapt_rate < 0.0) || (adapt_rate > 1.0)) {
  296. E_ERROR("Invalid -adapt-rate argument\n");
  297. usagemsg(argv[0]);
  298. }
  299. }
  300. else if (strcmp(argv[i], "-max-adreadsize") == 0) {
  301. i++;
  302. if ((i == argc) ||
  303. (sscanf(argv[i], "%d", &max_ad_read_size) != 1) ||
  304. (max_ad_read_size < 1)) {
  305. E_ERROR("Invalid -max-adreadsize argument\n");
  306. usagemsg(argv[0]);
  307. }
  308. }
  309. else if (strcmp(argv[i], "-c") == 0) {
  310. i++;
  311. if (i == argc) {
  312. E_ERROR("Invalid -c argument\n");
  313. usagemsg(argv[0]);
  314. }
  315. copyfile = argv[i];
  316. }
  317. else if ((strcmp(argv[i], "-rawmode") == 0)
  318. || (strcmp(argv[i], "-r") == 0)) {
  319. rawmode = 1;
  320. }
  321. else if (strcmp(argv[i], "-i") == 0) {
  322. i++;
  323. if (i == argc) {
  324. E_ERROR("Invalid -i argument\n");
  325. usagemsg(argv[0]);
  326. }
  327. infile = argv[i];
  328. }
  329. else {
  330. usagemsg(argv[0]);
  331. }
  332. }
  333. if (infile == NULL) {
  334. E_ERROR("No input file specified\n");
  335. usagemsg(argv[0]);
  336. }
  337. if ((infp = fopen(infile, "rb")) == NULL)
  338. E_FATAL("Failed to open '%s' for reading: %s\n", infile, strerror(errno));
  339. /*
  340. * Associate continuous listening module with opened input file and read function.
  341. * No A/D device is involved, but need to fill in ad->sps.
  342. * Calibrate input data using first few seconds of file, but then rewind it!!
  343. */
  344. ad.sps = sps;
  345. ad.bps = sizeof(int16);
  346. if (!rawmode)
  347. cont = cont_ad_init(&ad, file_ad_read);
  348. else
  349. cont = cont_ad_init_rawmode(&ad, file_ad_read);
  350. printf("Calibrating ...");
  351. fflush(stdout);
  352. if (cont_ad_calib(cont) < 0)
  353. printf(" failed; file too short?\n");
  354. else
  355. printf(" done\n");
  356. rewind(infp);
  357. /* Convert desired min. inter-utterance silence duration to #samples */
  358. siltime = (int32) (endsil * sps);
  359. /* Enable writing raw input to output by the cont module if specified */
  360. if (copyfile) {
  361. if ((rawfp = fopen(copyfile, "wb")) == NULL)
  362. E_ERROR("Failed to open raw output file '%s' for writing: %s\n",
  363. copyfile, strerror(errno));
  364. else
  365. cont_ad_set_rawfp(cont, rawfp);
  366. }
  367. cont_ad_get_params(cont,
  368. &orig_delta_sil, &orig_delta_speech,
  369. &orig_min_noise, &orig_max_noise,
  370. &winsize,
  371. &orig_speech_onset, &orig_sil_onset,
  372. &leader, &trailer, &orig_adapt_rate);
  373. E_INFO("Default parameters:\n");
  374. E_INFOCONT("\tmin-noise = %d, max-noise = %d\n",
  375. orig_min_noise, orig_max_noise);
  376. E_INFOCONT("\tdelta-sil = %d, delta-speech = %d\n",
  377. orig_delta_sil, orig_delta_speech);
  378. E_INFOCONT("\tsil-onset = %d, speech-onset = %d\n",
  379. orig_sil_onset, orig_speech_onset);
  380. E_INFOCONT("\tadapt_rate = %.3f\n", orig_adapt_rate);
  381. if (min_noise < 0)
  382. min_noise = orig_min_noise;
  383. if (max_noise < 0)
  384. max_noise = orig_max_noise;
  385. if (delta_sil < 0)
  386. delta_sil = orig_delta_sil;
  387. if (delta_speech < 0)
  388. delta_speech = orig_delta_speech;
  389. if (sil_onset < 0)
  390. sil_onset = orig_sil_onset;
  391. if (speech_onset < 0)
  392. speech_onset = orig_speech_onset;
  393. if (adapt_rate < 0.0)
  394. adapt_rate = orig_adapt_rate;
  395. cont_ad_set_params(cont,
  396. delta_sil, delta_speech,
  397. min_noise, max_noise,
  398. winsize,
  399. speech_onset, sil_onset,
  400. leader, trailer, adapt_rate);
  401. E_INFO("Current parameters:\n");
  402. E_INFOCONT("\tmin-noise = %d, max-noise = %d\n", min_noise, max_noise);
  403. E_INFOCONT("\tdelta-sil = %d, delta-speech = %d\n", delta_sil,
  404. delta_speech);
  405. E_INFOCONT("\tsil-onset = %d, speech-onset = %d\n", sil_onset,
  406. speech_onset);
  407. E_INFOCONT("\tadapt_rate = %.3f\n", adapt_rate);
  408. E_INFO("Sampling rate: %d", sps);
  409. E_INFOCONT("; Byteswap: %s", swap ? "Yes" : "No");
  410. E_INFOCONT("; Max ad-read size: %d\n", max_ad_read_size);
  411. if (debug)
  412. cont_ad_set_logfp(cont, stdout);
  413. total_speech_samples = 0;
  414. total_speech_sec = 0.0;
  415. uttid = 0;
  416. uttlen = 0;
  417. starttime = 0;
  418. fp = NULL;
  419. /* Process data */
  420. for (;;) {
  421. /* Get audio data from continuous listening module */
  422. k = cont_ad_read(cont, buf, 4096);
  423. if (k < 0) { /* End of input audio file; close any open output file and exit */
  424. if (fp != NULL) {
  425. fclose(fp);
  426. fp = NULL;
  427. printf
  428. ("Utt %08d, st= %8.2fs, et= %8.2fs, seg= %7.2fs (#samp= %10d)\n",
  429. uttid, (double) starttime / (double) sps,
  430. (double) (starttime + uttlen) / (double) sps,
  431. (double) uttlen / (double) sps, uttlen);
  432. fflush(stdout);
  433. total_speech_samples += uttlen;
  434. total_speech_sec += (double) uttlen / (double) sps;
  435. uttid++;
  436. }
  437. break;
  438. }
  439. if (cont->state == CONT_AD_STATE_SIL) { /* Silence data got */
  440. if (fp != NULL) { /* Currently in an utterance */
  441. if (cont->seglen > siltime) { /* Long enough silence detected; end the utterance */
  442. fclose(fp);
  443. fp = NULL;
  444. printf
  445. ("Utt %08d, st= %8.2fs, et= %8.2fs, seg= %7.2fs (#samp= %10d)\n",
  446. uttid, (double) starttime / (double) sps,
  447. (double) (starttime + uttlen) / (double) sps,
  448. (double) uttlen / (double) sps, uttlen);
  449. fflush(stdout);
  450. total_speech_samples += uttlen;
  451. total_speech_sec += (double) uttlen / (double) sps;
  452. uttid++;
  453. }
  454. else {
  455. /*
  456. * Short silence within utt; write it to output. (Some extra trailing silence
  457. * is included in the utterance, as a result. Not to worry about it.)
  458. */
  459. if (k > 0) {
  460. fwrite(buf, sizeof(int16), k, fp);
  461. uttlen += k;
  462. }
  463. }
  464. }
  465. }
  466. else {
  467. assert(cont->state == CONT_AD_STATE_SPEECH);
  468. if (fp == NULL) { /* Not in an utt; open a new output file */
  469. if (writeseg)
  470. sprintf(segfile, "%08d.raw", uttid);
  471. else
  472. strcpy(segfile, NULL_DEVICE);
  473. if ((fp = fopen(segfile, "wb")) == NULL)
  474. E_FATAL("Failed to open segmentation file '%s' for writing: %s\n", segfile, strerror(errno));
  475. starttime = cont->read_ts - k;
  476. uttlen = 0;
  477. }
  478. /* Write data obtained to output file */
  479. if (k > 0) {
  480. fwrite(buf, sizeof(int16), k, fp);
  481. uttlen += k;
  482. }
  483. }
  484. }
  485. if (rawfp)
  486. fclose(rawfp);
  487. E_INFO("Total raw input speech = %d frames, %d samples, %.2f sec\n",
  488. cont->tot_frm, cont->tot_frm * cont->spf,
  489. (cont->tot_frm * cont->spf) / (float32) cont->sps);
  490. E_INFO("Total speech detected = %d samples, %.2f sec\n",
  491. total_speech_samples, total_speech_sec);
  492. cont_ad_close(cont);
  493. return 0;
  494. }