Edinburgh Speech Tools  2.4-release
wagon_main.cc
1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1996-2006 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* Author : Alan W Black */
34 /* Date : May 1996 */
35 /*-----------------------------------------------------------------------*/
36 /* A Classification and Regression Tree (CART) Program */
37 /* A basic implementation of many of the techniques in */
38 /* Briemen et al. 1984 */
39 /* */
40 /* Added decision list support, Feb 1997 */
41 /* */
42 /* Added vector support for Clustergen 2005/2006 */
43 /* */
44 /*=======================================================================*/
45 #include <cstdlib>
46 #include <iostream>
47 #include <fstream>
48 #include <cstring>
49 #include "EST_Wagon.h"
50 #include "EST_cmd_line.h"
51 
52 enum wn_strategy_type {wn_decision_list, wn_decision_tree};
53 
54 static wn_strategy_type wagon_type = wn_decision_tree;
55 
56 static int wagon_main(int argc, char **argv);
57 
58 /** @name <command>wagon</command> <emphasis>CART building program</emphasis>
59  @id wagon_manual
60  * @toc
61  */
62 
63 //@{
64 
65 
66 /**@name Synopsis
67  */
68 //@{
69 
70 //@synopsis
71 
72 /**
73 wagon is used to build CART tress from feature data, its basic
74 features include:
75 
76 <itemizedlist>
77 <listitem><para>both decisions trees and decision lists are supported</para></listitem>
78 <listitem><para>predictees can be discrete or continuous</para></listitem>
79 <listitem><para>input features may be discrete or continuous</para></listitem>
80 <listitem><para>many options for controlling tree building</para>
81 <itemizedlist>
82 <listitem><para>fixed stop value</para></listitem>
83 <listitem><para>balancing</para></listitem>
84 <listitem><para>held-out data and pruning</para></listitem>
85 <listitem><para>stepwise use of input features</para></listitem>
86 <listitem><para>choice of optimization criteria correct/entropy (for
87 classification and rmse/correlation (for regression)</para></listitem>
88 </itemizedlist>
89 </listitem>
90 </itemizedlist>
91 
92 A detailed description of building CART models can be found in the
93 <link linkend="cart-overview">CART model overview</link> section.
94 
95 */
96 
97 //@}
98 
99 /**@name OPTIONS
100  */
101 //@{
102 
103 //@options
104 
105 //@}
106 
107 int main(int argc, char **argv)
108 {
109 
110  wagon_main(argc,argv);
111 
112  exit(0);
113  return 0;
114 }
115 
116 static int set_Vertex_Feats(EST_Track &wgn_VertexFeats,
117  EST_String &wagon_track_features)
118 {
119  int i,s=0,e;
120  EST_TokenStream ts;
121 
122  for (i=0; i<wgn_VertexFeats.num_channels(); i++)
123  wgn_VertexFeats.a(0,i) = 0.0;
124 
125  ts.open_string(wagon_track_features);
126  ts.set_WhiteSpaceChars(",- ");
127  ts.set_PunctuationSymbols("");
129  ts.set_SingleCharSymbols("");
130 
131  while (!ts.eof())
132  {
133  EST_Token &token = ts.get();
134  const EST_String ws = (const char *)token.whitespace();
135  if (token == "all")
136  {
137  for (i=0; i<wgn_VertexFeats.num_channels(); i++)
138  wgn_VertexFeats.a(0,i) = 1.0;
139  break;
140  } else if ((ws == ",") || (ws == ""))
141  {
142  s = atoi(token.string());
143  wgn_VertexFeats.a(0,s) = 1.0;
144  } else if (ws == "-")
145  {
146  if (token == "")
147  e = wgn_VertexFeats.num_channels()-1;
148  else
149  e = atoi(token.string());
150  for (i=s; i<=e && i<wgn_VertexFeats.num_channels(); i++)
151  wgn_VertexFeats.a(0,i) = 1.0;
152  } else
153  {
154  printf("wagon: track_feats invalid: %s at position %d\n",
155  (const char *)wagon_track_features,
156  ts.filepos());
157  exit(-1);
158  }
159  }
160 
161  return 0;
162 }
163 
164 static int wagon_main(int argc, char **argv)
165 {
166  // Top level function sets up data and creates a tree
167  EST_Option al;
168  EST_StrList files;
169  EST_String wgn_oname;
170  ostream *wgn_coutput = 0;
171  float stepwise_limit = 0;
172  int feats_start=0, feats_end=0;
173  int i;
174 
175  parse_command_line
176  (argc, argv,
177  EST_String("[options]\n") +
178  "Summary: CART building program\n"+
179  "-desc <ifile> Field description file\n"+
180  "-data <ifile> Datafile, one vector per line\n"+
181  "-stop <int> {50} Minimum number of examples for leaf nodes\n"+
182  "-test <ifile> Datafile to test tree on\n"+
183  "-frs <float> {10} Float range split, number of partitions to\n"+
184  " split a float feature range into\n"+
185  "-dlist Build a decision list (rather than tree)\n"+
186  "-dtree Build a decision tree (rather than list) default\n"+
187  "-output <ofile> \n"+
188  "-o <ofile> File to save output tree in\n"+
189  "-distmatrix <ifile>\n"+
190  " A distance matrix for clustering\n"+
191  "-track <ifile>\n"+
192  " track for vertex indices\n"+
193  "-track_start <int>\n"+
194  " start channel vertex indices\n"+
195  "-track_end <int>\n"+
196  " end (inclusive) channel for vertex indices\n"+
197  "-track_feats <string>\n"+
198  " Track features to use, comma separated list\n"+
199  " with feature numbers and/or ranges, 0 start\n"+
200  "-unittrack <ifile>\n"+
201  " track for unit start and length in vertex track\n"+
202  "-quiet No questions printed during building\n"+
203  "-verbose Lost of information printing during build\n"+
204  "-predictee <string>\n"+
205  " name of field to predict (default is first field)\n"+
206  "-ignore <string>\n"+
207  " Filename or bracket list of fields to ignore\n"+
208  "-count_field <string>\n"+
209  " Name of field containing count weight for samples\n"+
210  "-stepwise Incrementally find best features\n"+
211  "-swlimit <float> {0.0}\n"+
212  " Percentage necessary improvement for stepwise,\n"+
213  " may be negative.\n"+
214  "-swopt <string> Parameter to optimize for stepwise, for \n"+
215  " classification options are correct or entropy\n"+
216  " for regression options are rmse or correlation\n"+
217  " correct and correlation are the defaults\n"+
218  "-balance <float> For derived stop size, if dataset at node, divided\n"+
219  " by balance is greater than stop it is used as stop\n"+
220  " if balance is 0 (default) always use stop as is.\n"+
221  "-cos Use mean cosine distance rather than Gaussian (TBD).\n"+
222  "-dof <float> Randomly dropout feats in training (prob).\n"+
223  "-dos <float> Randomly dropout samples in training (prob).\n"+
224  "-vertex_output <string> Output <mean> or <best> of cluster\n"+
225  "-held_out <int> Percent to hold out for pruning\n"+
226  "-max_questions <int> Maximum number of questions in tree\n"+
227  "-heap <int> {210000}\n"+
228  " Set size of Lisp heap, should not normally need\n"+
229  " to be changed from its default, only with *very*\n"+
230  " large description files (> 1M)\n"+
231  "-omp_nthreads <int> {1}\n"+
232  " Set number of OMP threads to run wagon in\n"+
233  " tree building; this overrides $OMP_NUM_THREADS\n"+
234  " (ignored if not supported)\n"+
235  "-noprune No (same class) pruning required\n",
236  files, al);
237 
238  if (al.present("-held_out"))
239  wgn_held_out = al.ival("-held_out");
240  if (al.present("-dof"))
241  wgn_dropout_feats = al.fval("-dof");
242  if (al.present("-dos"))
243  wgn_dropout_samples = al.fval("-dos");
244  if (al.present("-cos"))
245  wgn_cos = al.ival("-cos");
246  if (al.present("-balance"))
247  wgn_balance = al.fval("-balance");
248  if ((!al.present("-desc")) || ((!al.present("-data"))))
249  {
250  cerr << argv[0] << ": missing description and/or datafile" << endl;
251  cerr << "use -h for description of arguments" << endl;
252  }
253 
254  if (al.present("-quiet"))
255  wgn_quiet = TRUE;
256  if (al.present("-verbose"))
257  wgn_verbose = TRUE;
258 
259  if (al.present("-stop"))
260  wgn_min_cluster_size = atoi(al.val("-stop"));
261  if (al.present("-max_questions"))
262  wgn_max_questions = atoi(al.val("-max_questions"));
263  if (al.present("-noprune"))
264  wgn_prune = FALSE;
265  if (al.present("-predictee"))
266  wgn_predictee_name = al.val("-predictee");
267  if (al.present("-count_field"))
268  wgn_count_field_name = al.val("-count_field");
269  if (al.present("-swlimit"))
270  stepwise_limit = al.fval("-swlimit");
271  if (al.present("-frs")) // number of partitions to try in floats
272  wgn_float_range_split = atof(al.val("-frs"));
273  if (al.present("-swopt"))
274  wgn_opt_param = al.val("-swopt");
275  if (al.present("-vertex_output"))
276  wgn_vertex_output = al.val("-vertex_output");
277  if (al.present("-output") || al.present("-o"))
278  {
279  if (al.present("-o"))
280  wgn_oname = al.val("-o");
281  else
282  wgn_oname = al.val("-output");
283  wgn_coutput = new ofstream(wgn_oname);
284  if (!(*wgn_coutput))
285  {
286  cerr << "Wagon: can't open file \"" << wgn_oname <<
287  "\" for output " << endl;
288  exit(-1);
289  }
290  }
291  else
292  wgn_coutput = &cout;
293  if (al.present("-distmatrix"))
294  {
295  if (wgn_DistMatrix.load(al.val("-distmatrix")) != 0)
296  {
297  cerr << "Wagon: failed to load Distance Matrix from \"" <<
298  al.val("-distmatrix") << "\"\n" << endl;
299  exit(-1);
300  }
301  }
302  if (al.present("-dlist"))
303  wagon_type = wn_decision_list;
304 
305  WNode *tree;
306  float score;
307  LISP ignores = NIL;
308 
309  siod_init(al.ival("-heap"));
310 
311  if (al.present("-ignore"))
312  {
313  EST_String ig = (const char *)al.sval("-ignore");
314  if (ig[0] == '(')
315  ignores = read_from_string(ig);
316  else
317  ignores = vload(ig,1);
318  }
319  // Load in the data
320  wgn_load_datadescription(al.val("-desc"),ignores);
321  wgn_load_dataset(wgn_dataset,al.val("-data"));
322  if (al.present("-distmatrix") &&
323  (wgn_DistMatrix.num_rows() < wgn_dataset.length()))
324  {
325  cerr << "wagon: distance matrix is smaller than number of training elements\n";
326  exit(-1);
327  }
328  else if (al.present("-track"))
329  {
330  wgn_VertexTrack.load(al.val("-track"));
331  wgn_VertexFeats.resize(1,wgn_VertexTrack.num_channels());
332  for (i=0; i<wgn_VertexFeats.num_channels(); i++)
333  wgn_VertexFeats.a(0,i) = 1.0;
334  }
335 
336  if (al.present("-track_start"))
337  {
338  feats_start = al.ival("-track_start");
339  if ((feats_start < 0) ||
340  (feats_start > wgn_VertexTrack.num_channels()))
341  {
342  printf("wagon: track_start invalid: %d out of %d channels\n",
343  feats_start,
344  wgn_VertexTrack.num_channels());
345  exit(-1);
346  }
347  for (i=0; i<feats_start; i++)
348  wgn_VertexFeats.a(0,i) = 0.0; /* don't do feats up to start */
349 
350  }
351 
352  if (al.present("-track_end"))
353  {
354  feats_end = al.ival("-track_end");
355  if ((feats_end < feats_start) ||
356  (feats_end > wgn_VertexTrack.num_channels()))
357  {
358  printf("wagon: track_end invalid: %d between start %d out of %d channels\n",
359  feats_end,
360  feats_start,
361  wgn_VertexTrack.num_channels());
362  exit(-1);
363  }
364  for (i=feats_end+1; i<wgn_VertexTrack.num_channels(); i++)
365  wgn_VertexFeats.a(0,i) = 0.0; /* don't do feats after end */
366  }
367  if (al.present("-track_feats"))
368  { /* overrides start and end numbers */
369  EST_String wagon_track_features = (const char *)al.val("-track_feats");
370  set_Vertex_Feats(wgn_VertexFeats,wagon_track_features);
371  }
372 
373  // printf("Track feats\n");
374  // for (i=0; i<wgn_VertexTrack.num_channels(); i++)
375  // if (wgn_VertexFeats.a(0,i) > 0.0)
376  // printf("%d ",i);
377  // printf("\n");
378 
379  if (al.present("-unittrack"))
380  { /* contains two features, a start and length. start indexes */
381  /* into VertexTrack to the first vector in the segment */
382  wgn_UnitTrack.load(al.val("-unittrack"));
383  }
384 
385 #ifdef OMP_WAGON
386  if (al.present ("-omp_nthreads"))
387  {
388  omp_set_num_threads(atoi(al.val("-omp_nthreads")));
389  }else{
390  omp_set_num_threads(1);
391  }
392 #else
393  if (al.present ("-omp_nthreads"))
394  {
395  printf("wagon: -omp_nthreads ignored: not supported in this build.\n");
396  }
397 #endif
398 
399  if (al.present("-test"))
400  wgn_load_dataset(wgn_test_dataset,al.val("-test"));
401 
402  // Build and test the model
403  if (al.present("-stepwise"))
404  tree = wagon_stepwise(stepwise_limit);
405  else if (wagon_type == wn_decision_tree)
406  tree = wgn_build_tree(score); // default operation
407  else if (wagon_type == wn_decision_list)
408  // dlist is printed with build_dlist rather than returned
409  tree = wgn_build_dlist(score,wgn_coutput);
410  else
411  {
412  cerr << "Wagon: unknown operation, not tree or list" << endl;
413  exit(-1);
414  }
415 
416  if (tree != 0)
417  {
418  *wgn_coutput << *tree;
419  summary_results(*tree,wgn_coutput);
420  }
421 
422  if (wgn_coutput != &cout)
423  delete wgn_coutput;
424  return 0;
425 }
426 
427 /** @name Building Trees
428 
429 To build a decision tree (or list) Wagon requires data and a description
430 of it. A data file consists a set of samples, one per line each
431 consisting of the same set of features. Features may be categorial
432 or continuous. By default the first feature is the predictee and the
433 others are used as predictors. A typical data file will look like
434 this
435 </para>
436 <para>
437 <screen>
438 0.399 pau sh 0 0 0 1 1 0 0 0 0 0 0
439 0.082 sh iy pau onset 0 1 0 0 1 1 0 0 1
440 0.074 iy hh sh coda 1 0 1 0 1 1 0 0 1
441 0.048 hh ae iy onset 0 1 0 1 1 1 0 1 1
442 0.062 ae d hh coda 1 0 0 1 1 1 0 1 1
443 0.020 d y ae coda 2 0 1 1 1 1 0 1 1
444 0.082 y ax d onset 0 1 0 1 1 1 1 1 1
445 0.082 ax r y coda 1 0 0 1 1 1 1 1 1
446 0.036 r d ax coda 2 0 1 1 1 1 1 1 1
447 ...
448 </screen>
449 </para>
450 <para>
451 The data may come from any source, such as the festival script
452 dumpfeats which allows the creation of such files easily from utterance
453 files.
454 </para><para>
455 In addition to a data file a description file is also require that
456 gives a name and a type to each of the features in the datafile.
457 For the above example it would look like
458 </para><para>
459 <screen>
460 ((segment_duration float)
461  ( name aa ae ah ao aw ax ay b ch d dh dx eh el em en er ey f g
462  hh ih iy jh k l m n nx ng ow oy p r s sh t th uh uw v w y z zh pau )
463  ( n.name 0 aa ae ah ao aw ax ay b ch d dh dx eh el em en er ey f g
464  hh ih iy jh k l m n nx ng ow oy p r s sh t th uh uw v w y z zh pau )
465  ( p.name 0 aa ae ah ao aw ax ay b ch d dh dx eh el em en er ey f g
466  hh ih iy jh k l m n nx ng ow oy p r s sh t th uh uw v w y z zh pau )
467  (position_type 0 onset coda)
468  (pos_in_syl float)
469  (syl_initial 0 1)
470  (syl_final 0 1)
471  (R:Sylstructure.parent.R:Syllable.p.syl_break float)
472  (R:Sylstructure.parent.syl_break float)
473  (R:Sylstructure.parent.R:Syllable.n.syl_break float)
474  (R:Sylstructure.parent.R:Syllable.p.stress 0 1)
475  (R:Sylstructure.parent.stress 0 1)
476  (R:Sylstructure.parent.R:Syllable.n.stress 0 1)
477 )
478 </screen>
479 </para><para>
480 The feature names are arbitrary, but as they appear in the generated
481 trees is most useful if the trees are to be used in prediction of
482 an utterance that the names are features and/or pathnames.
483 </para><para>
484 Wagon can be used to build a tree with such files with the command
485 <screen>
486 wagon -data feats.data -desc fest.desc -stop 10 -output feats.tree
487 </screen>
488 A test data set may also be given which must match the given data description.
489 If specified the built tree will be tested on the test set and results
490 on that will be presented on completion, without a test set the
491 results are given with respect to the training data. However in
492 stepwise case the test set is used in the multi-level training process
493 thus it cannot be considered as true test data and more reasonable
494 results should found on applying the generate tree to truly
495 held out data (via the program wagon_test).
496 
497 */
498 
499 //@}
EST_Option
Definition: EST_Option.h:50
EST_TokenStream::eof
int eof()
end of file
Definition: EST_Token.h:356
EST_TList
Definition: EST_TList.h:59
EST_Track::load
EST_read_status load(const EST_String name, float ishift=0.0, float startt=0.0)
Definition: EST_Track.cc:1309
EST_Track::resize
void resize(int num_frames, int num_channels, bool preserve=1)
Definition: EST_Track.cc:211
EST_TokenStream
Definition: EST_Token.h:235
EST_TMatrix::num_rows
int num_rows() const
return number of rows
Definition: EST_TMatrix.h:179
EST_Track::a
float & a(int i, int c=0)
Definition: EST_Track.cc:1022
EST_Track::num_channels
int num_channels() const
return number of channels in track
Definition: EST_Track.h:656
EST_TKVL::present
const int present(const K &rkey) const
Returns true if key is present.
Definition: EST_TKVL.cc:222
EST_TokenStream::filepos
int filepos(void) const
current file position in \Ref{EST_TokenStream}
Definition: EST_Token.h:361
EST_Option::fval
float fval(const EST_String &rkey, int m=1) const
Definition: EST_Option.cc:98
EST_Token
Definition: EST_Token.h:73
EST_TokenStream::set_SingleCharSymbols
void set_SingleCharSymbols(const EST_String &sc)
set which characters are to be treated as single character symbols
Definition: EST_Token.h:338
EST_TokenStream::set_PrePunctuationSymbols
void set_PrePunctuationSymbols(const EST_String &ps)
set which characters are to be treated as (post) punctuation
Definition: EST_Token.h:344
EST_TokenStream::open_string
int open_string(const EST_String &newbuffer)
open a \Ref{EST_TokenStream} for string rather than a file
Definition: EST_Token.cc:251
EST_TokenStream::get
EST_TokenStream & get(EST_Token &t)
get next token in stream
Definition: EST_Token.cc:486
EST_String
Definition: EST_String.h:70
EST_TokenStream::set_WhiteSpaceChars
void set_WhiteSpaceChars(const EST_String &ws)
set which characters are to be treated as whitespace
Definition: EST_Token.h:335
EST_TokenStream::set_PunctuationSymbols
void set_PunctuationSymbols(const EST_String &ps)
set which characters are to be treated as (post) punctuation
Definition: EST_Token.h:341
EST_Track
Definition: EST_Track.h:89
EST_FMatrix::load
EST_read_status load(const EST_String &filename)
Load from file (ascii or binary as defined in file)
Definition: EST_FMatrix.cc:513
EST_Option::ival
int ival(const EST_String &rkey, int m=1) const
Definition: EST_Option.cc:76
WNode
Definition: EST_Wagon.h:222
EST_Option::sval
const EST_String & sval(const EST_String &rkey, int m=1) const
Definition: EST_Option.cc:87
EST_TKVL::val
const V & val(const K &rkey, bool m=0) const
return value according to key (const)
Definition: EST_TKVL.cc:145