Logo Search packages:      
Sourcecode: octave-audio version File versions  Download package

aurecord.cc

/*
 * HISTORY:
 *    May, 1999 - separate audio open/close from wave play
 *    Feb. 1999 - first public release.
 *
 * Copyright 1999 Paul Kienzle <pkienzle@users.sf.net>
 * This source code is freely redistributable and may be used for
 * any purpose.  This copyright notice must be maintained. 
 * Paul Kienzle is not responsible for the consequences of using
 * this software.
 * TODO: Support SGI, Sun and Windows devices
 * TODO: Clean up user interaction, possibly adding GUI support
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <signal.h>
#include "endpoint.h" 


#ifdef TEST
#include <stdarg.h>

/* Print a printf-style formatted message on stderr (standalone TEST build).
 *
 * fmt - printf format string; the remaining arguments supply its values.
 */
void mymessage (const char *fmt, ...)
{
  va_list args;
  va_start (args, fmt);
  /* A va_list has to go through the v* variant: fprintf would treat the
   * va_list object itself as the first format argument. */
  vfprintf (stderr, fmt, args);
  va_end (args);
}
#else
#include <octave/oct.h>
void mymessage (const char *fmt, ...)
{
  va_list args;
  va_start (args, fmt);
  message ("aurecord", fmt, args);
  va_end (args);
}
#endif

/* ==================================================================== */
/* Input conversion routines (audio file -> machine representation) */

/* Read a 2 byte signed integer in little endian (Intel) format.
 *
 * buf    - points at the two raw bytes of the sample (left unmodified)
 * sample - receives the decoded value
 * Returns the number of bytes consumed (always 2).
 */
static int from_S16_LE(char *buf, short *sample)
{
  /* Build the value from its individual bytes so the result does not
   * depend on the host byte order, on buf being aligned for a short,
   * or on the __BYTE_ORDER macros happening to be defined. */
  const unsigned char *bytes = (const unsigned char *)buf;
  int value = bytes[0] | (bytes[1] << 8);

  if (value >= 0x8000) value -= 0x10000;   /* sign-extend from 16 bits */
  *sample = (short)value;
  return 2;
}

/* Read a 2 byte signed integer in big endian (non-Intel) format.
 *
 * buf    - points at the two raw bytes of the sample (left unmodified)
 * sample - receives the decoded value
 * Returns the number of bytes consumed (always 2).
 */
static int from_S16_BE(char *buf, short *sample)
{
  /* Build the value from its individual bytes so the result does not
   * depend on the host byte order, on buf being aligned for a short,
   * or on the __BYTE_ORDER macros happening to be defined. */
  const unsigned char *bytes = (const unsigned char *)buf;
  int value = (bytes[0] << 8) | bytes[1];

  if (value >= 0x8000) value -= 0x10000;   /* sign-extend from 16 bits */
  *sample = (short)value;
  return 2;
}


/* Read a 2 byte unsigned integer in little endian (Intel) format and
 * convert it to a signed sample by removing the 32768 offset.
 *
 * buf    - points at the two raw bytes of the sample (left unmodified)
 * sample - receives the decoded value
 * Returns the number of bytes consumed (always 2).
 */
static int from_U16_LE(char *buf, short *sample)
{
  /* Build the value from its individual bytes so the result does not
   * depend on the host byte order, on buf being aligned for a short,
   * or on the __BYTE_ORDER macros happening to be defined. */
  const unsigned char *bytes = (const unsigned char *)buf;
  const long value = (long)(bytes[0] | (bytes[1] << 8));

  *sample = (short)(value - 32768);   /* 0..65535 -> -32768..32767 */
  return 2;
}

/* Read a 2 byte unsigned integer in big endian (non-Intel) format and
 * convert it to a signed sample by removing the 32768 offset.
 *
 * buf    - points at the two raw bytes of the sample (left unmodified)
 * sample - receives the decoded value
 * Returns the number of bytes consumed (always 2).
 */
static int from_U16_BE(char *buf, short *sample)
{
  /* Build the value from its individual bytes so the result does not
   * depend on the host byte order, on buf being aligned for a short,
   * or on the __BYTE_ORDER macros happening to be defined. */
  const unsigned char *bytes = (const unsigned char *)buf;
  const long value = (long)((bytes[0] << 8) | bytes[1]);

  *sample = (short)(value - 32768);   /* 0..65535 -> -32768..32767 */
  return 2;
}

/* Expand one A-law compressed byte into a 2 byte signed sample.
 *
 * buf    - points at the compressed byte
 * sample - receives the expanded value
 * Returns the number of bytes consumed (always 1).
 */
static int from_A_LAW(char *buf, short *sample)
{
  /* Expansion of the low seven bits; the top bit of the code selects
   * whether the table entry is used as is or negated. */
  static short expand[] = {
     -5504,  -5248,  -6016,  -5760,  -4480,  -4224,  -4992,  -4736,
     -7552,  -7296,  -8064,  -7808,  -6528,  -6272,  -7040,  -6784,
     -2752,  -2624,  -3008,  -2880,  -2240,  -2112,  -2496,  -2368,
     -3776,  -3648,  -4032,  -3904,  -3264,  -3136,  -3520,  -3392,
    -22016, -20992, -24064, -23040, -17920, -16896, -19968, -18944,
    -30208, -29184, -32256, -31232, -26112, -25088, -28160, -27136,
    -11008, -10496, -12032, -11520,  -8960,  -8448,  -9984,  -9472,
    -15104, -14592, -16128, -15616, -13056, -12544, -14080, -13568,
      -344,   -328,   -376,   -360,   -280,   -264,   -312,   -296,
      -472,   -456,   -504,   -488,   -408,   -392,   -440,   -424,
       -88,    -72,   -120,   -104,    -24,     -8,    -56,    -40,
      -216,   -200,   -248,   -232,   -152,   -136,   -184,   -168,
     -1376,  -1312,  -1504,  -1440,  -1120,  -1056,  -1248,  -1184,
     -1888,  -1824,  -2016,  -1952,  -1632,  -1568,  -1760,  -1696,
      -688,   -656,   -752,   -720,   -560,   -528,   -624,   -592,
      -944,   -912,  -1008,   -976,   -816,   -784,   -880,   -848 };
  const unsigned char code = *(unsigned char *)buf;
  const int entry = expand[code & 0x7F];

  *sample = (short)((code & 0x80) ? -entry : entry);
  return 1;
}

/* Expand one mu-law compressed byte into a 2 byte signed sample.
 *
 * buf    - points at the compressed byte
 * sample - receives the expanded value
 * Returns the number of bytes consumed (always 1).
 */
static int from_MU_LAW(char *buf, short *sample)
{
  /* Expansion of the low seven bits; the top bit of the code selects
   * whether the table entry is used as is or negated. */
  static short expand[] = {
    -32124, -31100, -30076, -29052, -28028, -27004, -25980, -24956,
    -23932, -22908, -21884, -20860, -19836, -18812, -17788, -16764,
    -15996, -15484, -14972, -14460, -13948, -13436, -12924, -12412,
    -11900, -11388, -10876, -10364,  -9852,  -9340,  -8828,  -8316,
     -7932,  -7676,  -7420,  -7164,  -6908,  -6652,  -6396,  -6140,
     -5884,  -5628,  -5372,  -5116,  -4860,  -4604,  -4348,  -4092,
     -3900,  -3772,  -3644,  -3516,  -3388,  -3260,  -3132,  -3004,
     -2876,  -2748,  -2620,  -2492,  -2364,  -2236,  -2108,  -1980,
     -1884,  -1820,  -1756,  -1692,  -1628,  -1564,  -1500,  -1436,
     -1372,  -1308,  -1244,  -1180,  -1116,  -1052,   -988,   -924,
      -876,   -844,   -812,   -780,   -748,   -716,   -684,   -652,
      -620,   -588,   -556,   -524,   -492,   -460,   -428,   -396,
      -372,   -356,   -340,   -324,   -308,   -292,   -276,   -260,
      -244,   -228,   -212,   -196,   -180,   -164,   -148,   -132,
      -120,   -112,   -104,    -96,    -88,    -80,    -72,    -64,
       -56,    -48,    -40,    -32,    -24,    -16,     -8,      0 };
  const unsigned char code = *(unsigned char *)buf;
  const int entry = expand[code & 0x7F];

  *sample = (short)((code & 0x80) ? -entry : entry);
  return 1;
}

/* Read a 1 byte unsigned value and convert to 2 byte signed integer.
 *
 * buf    - points at the raw byte
 * sample - receives (byte - 128) scaled into the upper 8 bits
 * Returns the number of bytes consumed (always 1).
 */
static int from_U8(char *buf, short *sample)
{
  const int centered = (int)*(unsigned char *)buf - 128;

  /* Multiply instead of shifting: centered is negative for bytes below
   * 128 and left-shifting a negative value is undefined. */
  *sample = (short)(centered * 256);
  return 1;
}

/* Read a 1 byte signed value and convert to 2 byte signed integer.
 *
 * buf    - points at the raw byte
 * sample - receives the byte's signed value scaled into the upper 8 bits
 * Returns the number of bytes consumed (always 1).
 */
static int from_S8(char *buf, short *sample)
{
  int value = *(unsigned char *)buf;

  /* Sign-extend explicitly so the result never depends on how an
   * out-of-range int is converted to short. */
  if (value >= 128) value -= 256;
  *sample = (short)(value * 256);
  return 1;
}

/* ===================================================================== */
/* Audio device routines */

/* Okay, now for the OS specific audio code:
 *
 * audioopen(int rate, int channels) opens the audio device for
 * recording and returns a positive value on success.  This routine
 * must set the global variables audiorate and audiochannels to the
 * actual rate and channels selected for the device, which may be
 * different from those requested.  This routine must also set
 * audioconvert, the function which takes one sample in the audio
 * format selected for the device and converts it to the machine
 * representation for samples (2 byte signed integers).
 *
 * audiosample() returns the next sample read from the device, already
 * converted to the machine representation.
 *
 * audioclose() closes the audio device.
 *
 * audioabort() resets the device and closes it if it is open.  */

/* Signature shared by the from_* decoders above: read one sample at buf,
 * store it in *sample and return the number of bytes consumed. */
typedef int (*CONVERSION)(char *buf, short *sample);
/* Decoder matching the sample format chosen by audioopen(). */
static CONVERSION audioconvert;
/* Sample rate reported back by the device; set by audioopen(). */
static int audiorate;
/* Channel count in effect on the device; set by audioopen(). */
static int audiochannels;

/* ==================================================================== */
#if 1  /* LINUX OSS audio drivers */
#include <linux/soundcard.h>

/* Descriptor of the open audio device, or -1 while no device is open. */
static int audio = -1;

/* Open /dev/dsp for recording and configure channels, sample format
 * and sample rate.
 *
 * rate     - requested sample rate; the rate reported back by the
 *            driver is stored in audiorate
 * channels - requested channel count; anything above 1 requests
 *            stereo, the resulting count is stored in audiochannels
 *
 * On success audioconvert is set to the decoder for the selected
 * sample format.
 *
 * Returns 1 on success and -1 on any failure.  After a failure the
 * device is closed again and audio is back at -1, so a later
 * audioabort() does not touch a stale descriptor.
 */
int audioopen(int rate, int channels)
{
  int format, outformat, mask;
  int stereo;

  /* Open audio device */
  audio = open("/dev/dsp", O_RDONLY);
  if (audio < 0) return -1;

  /* Set channels (mono vs. stereo) and remember what was set.  The
   * ioctl takes 0 for mono and 1 for stereo, so clamp the request. */
  stereo = (channels > 1) ? 1 : 0;
  if (ioctl(audio, SNDCTL_DSP_STEREO, &stereo) < 0) goto error;
  audiochannels = stereo+1;

  /* Set input format. Convert to a format which preserves the most
   * bits if the selected format is unavailable.
   */
#if __BYTE_ORDER == __LITTLE_ENDIAN
  outformat = format = AFMT_S16_LE, audioconvert=from_S16_LE;
#else
  outformat = format = AFMT_S16_BE, audioconvert=from_S16_BE;
#endif
  if (ioctl(audio, SNDCTL_DSP_SETFMT, &outformat) < 0) goto error;
  if (outformat != format) {
    /* The driver picked something else: choose the best format it
     * advertises and insist on getting exactly that one. */
    if (ioctl(audio, SNDCTL_DSP_GETFMTS, &mask) < 0) goto error;
    if (mask&AFMT_S16_LE)      format = AFMT_S16_LE, audioconvert=from_S16_LE;
    else if (mask&AFMT_S16_BE) format = AFMT_S16_BE, audioconvert=from_S16_BE;
    else if (mask&AFMT_U16_LE) format = AFMT_U16_LE, audioconvert=from_U16_LE;
    else if (mask&AFMT_U16_BE) format = AFMT_U16_BE, audioconvert=from_U16_BE;
    else if (mask&AFMT_MU_LAW) format = AFMT_MU_LAW, audioconvert=from_MU_LAW;
    else if (mask&AFMT_A_LAW)  format = AFMT_A_LAW,  audioconvert=from_A_LAW;
    else if (mask&AFMT_U8)     format = AFMT_U8,     audioconvert=from_U8;
    else if (mask&AFMT_S8)     format = AFMT_S8,     audioconvert=from_S8;
    else goto error;
    outformat = format;
    if (ioctl(audio, SNDCTL_DSP_SETFMT, &outformat) < 0) goto error;
    /* A mismatch here would make audioconvert decode the wrong format. */
    if (outformat != format) goto error;
  }

  /* Set sample rate and remember what was set. */
  if (ioctl(audio, SNDCTL_DSP_SPEED, &rate) < 0) goto error;
  audiorate = rate;
  return 1;

error:
  close(audio);
  audio = -1;
  /* Callers test for a negative result, so every failure reports -1. */
  return -1;
}

/* Return the next sample from the audio device, decoded by audioconvert.
 *
 * Device data is fetched into a 2048 byte buffer that is refilled
 * whenever it has been consumed.  If the device reports end of file or
 * an error before the buffer is full, the remainder is filled with
 * zero bytes.
 */
static short audiosample()
{
  static char buf[2048];
  static size_t bufpos = sizeof(buf);   /* start "empty" to force a read */
  short sample;

  if (bufpos >= sizeof(buf)) {
    size_t filled = 0;

    /* Keep reading until the buffer is full: a short read must not be
     * padded, or zeros would be spliced into the middle of the stream. */
    while (filled < sizeof(buf)) {
      ssize_t got = read(audio, buf + filled, sizeof(buf) - filled);
      if (got <= 0) break;              /* end of file or read error */
      filled += (size_t)got;
    }
    /* Zero whatever could not be read so no stale data is replayed. */
    memset(buf + filled, 0, sizeof(buf) - filled);
    bufpos = 0;
  }
  bufpos += (*audioconvert)(buf+bufpos, &sample);
  return sample;
}

/* Close the audio device and record that no device is open. */
void audioclose()
{
  const int fd = audio;

  audio = -1;
  close(fd);
}

/* Reset the audio device and close it; does nothing when no device is
 * open. */
void audioabort() 
{
  if (audio == -1) return;

  ioctl(audio, SNDCTL_DSP_RESET, NULL);
  audioclose();
}
#endif

/* Show a status line to the user through mymessage().
 *
 * str - text to show, followed by a newline; NULL emits just a newline.
 */
void inform(const char *str)
{
  if (str == NULL) {
    mymessage ("\n");
    return;
  }

  mymessage ("%s\n", str);
}

int capture(int rate, short *capturebuf, int capturelen)
{
  // Note: initial silence is WINDOW+2*STEP
  const float STEP=0.010;             // step size in sec
  const float WINDOW=0.016;   // window size in sec
  const long ENDSILENCE=700;      // duration of end silence in msec 
  const long MINLENGTH=300;       // minimum utterance in msec
  
  endpointer *ep;
  int framelen, framestep;
  short *frame;
  int framenumber=0; /* Currently active frame number */
  int framepos = 0;
  int capturepos, captureend, remaining;
  EPTAG tag, state=EP_RESET;

  /* initialize capture */
  framelen = (int)(WINDOW*(float)rate);
  framestep = (int)(STEP*(float)rate);
  frame = new short[framelen];
  ep = new endpointer(rate, framestep, framelen, ENDSILENCE, MINLENGTH);

  while (1) {
    /* Fill the next frame */
    while (framepos < framelen) frame[framepos++] = audiosample();
    framenumber++;

    /* Process frame through the end point detector */
    tag = ep -> getendpoint (frame);// get endpoint tag
#if 0
    mymessage ("     tag=%s, state=%s\n", 
           ep->gettagname(tag), ep->gettagname(state));
#endif
    switch (tag) {      // determine what to do with this frame
    case EP_NOSTARTSILENCE:   // error condition --- restart process
      if (tag == EP_NOSTARTSILENCE)
      inform("Spoke too soon. Wait a bit and try again...");
      ep->initendpoint();
      framenumber = 0;
      // fall through to RESET
      
    case EP_RESET:            // false start --- restart recognizer
      // fall through to SILENCE
      
    case EP_SILENCE:          // not yet start of utterance
      if (state != EP_SILENCE && framenumber > 3) {
      inform("Waiting for you to speak...");
      state = EP_SILENCE;
      }
      capturepos = 0;
      break;
      
    case EP_MAYBEEND:         // possible end of utterance
      if (tag == EP_MAYBEEND) captureend = capturepos;
      // fall through to SIGNAL
      
      
    case EP_NOTEND:           // the last MAYBEEND was NOT the end 
      if (tag == EP_NOTEND) captureend = 0;
      // fall through to SIGNAL
      
    case EP_INUTT:            // confirmed signal start
      // all data frames before this marked as EP_SIGNAL were part
      // of the actual utterance.  A reset after this point will be
      // due to a rejected signal rather than a false start.
      if (state != EP_INUTT) {
      inform("Capturing your speech...");
      state = EP_INUTT;
      }
      // fall through to SIGNAL
      
    case EP_SIGNAL:           // signal frame
      // Copy frame into capture buf.
      remaining = capturelen - capturepos;
      if (remaining > framestep) remaining = framestep;
      if (remaining > 0) 
      memcpy(capturebuf+capturepos, frame, remaining*sizeof(*frame));
      capturepos += remaining;
      
      // Check for end of capture buf.
      if (capturepos == capturelen) {
      if (captureend == 0) captureend = capturepos;
      inform("Speech exceeded capture duration. Use -t to increase.");
      inform(NULL);
      return captureend;
      }
      break;
      
    case EP_ENDOFUTT:         // confirmed end of utterance
      // This is a silence frame after the end of signal.  The previous
      // MAYBEEND frame was the actual end of utterance
      inform(NULL);
      return captureend;
    }

    /* Shift the frame overlap to the start of the frame. */
    framepos = framelen - framestep;
    memmove(frame, frame+framestep, framepos*sizeof(*frame));
  }

  return 0;
}


/* Signal handler: abort any recording in progress, then terminate the
 * process with exit status 2.  The signal number is not used. */
void cleanup(int sig)
{
  (void) sig;

  audioabort();
  exit(2);
}

#ifdef TEST

/* Standalone recorder (TEST build).
 *
 * Options: -e wait for an audio event (end point detection) before
 *             grabbing samples
 *          -t time      duration to record (default 1)
 *          -r rate      requested sample rate (default 16000)
 *          -c channels  requested channel count (default 1)
 *
 * Writes audiorate and audiochannels (4 bytes each) followed by the
 * captured samples (2 bytes each) to stdout.
 * Returns 0 on success and 1 if the audio device can not be opened.
 */
int main(int argc, char *argv[]) 
{
   int do_endpoint = 0;
   int rate=16000, channels=1;
   double time=1;
   short *buf;
   int i, c, samples;

   /* Interpret options */
   while ((c = getopt(argc, argv, "et:r:c:?")) != -1) {
     switch (c) {
     case 'e': do_endpoint = 1; break;
     case 'r': rate = atoi(optarg); break;
     case 'c': channels = atoi(optarg); break;
     case 't': time = atof(optarg); break;
     case '?':
       fprintf (stderr,
                "usage: aurecord [-e] [-t time] [-r rate] [-c channels]\n");
       exit(1);
     }
   }

   /* Prepare for interrupt. */
   signal(SIGINT, cleanup);

   /* open audio device and skip the first bunch of samples.
    * Anything that is not positive is a failure. */
   if (audioopen(rate, channels) <= 0) {
     fprintf (stderr, "aurecord: can not open audio device\n");
     return 1;
   }
   for (i = 0; i < 1024; i++) audiosample();

   fwrite(&audiorate, 4, 1, stdout);
   fwrite(&audiochannels, 4, 1, stdout);
   samples = (long)((double)audiorate * time)*audiochannels;
   /* A negative duration must not become a negative allocation size. */
   if (samples < 0) samples = 0;
   buf = new short[samples];

   if (do_endpoint) {
     /* wait for audio event before grabbing samples */
     samples = capture(audiorate, buf, samples);
   }
   else {
     /* grab all the samples you need directly */
     for (i = 0; i < samples; i++) buf[i] = audiosample();
   }

   /* close the audio device */
   audioclose();

   /* output the captured samples */
   fwrite(buf, 2, samples, stdout);
   delete [] buf;
   return 0;
}

#else

DEFUN_DLD (aurecord, args, nargout,
  "-*- texinfo -*-\n\
@deftypefn {Loadable Function} {[@var{x}, @var{fs}, @var{chan}] =} aurecord (@var{t}, @var{fs}, @var{chan})\n\
@deftypefnx {Loadable Function} {[@var{x}, @var{fs}, @var{chan}] =} aurecord (@var{t}, @var{fs}, @var{chan}, 'endpoint')\n\
\n\
Record for the specified time at the given sample rate. Note that\n\
the sample rate used may not match the requested sample rate.  Use\n\
the returned rate instead of the requested value in further\n\
processing. Similarly, the actual number of samples and channels\n\
may not match the request, so check the size of the returned matrix.\n\
\n\
@var{fs} defaults to 16000 Hz and @var{chan} defaults to 1. @var{time} is\n\
measured in seconds. @code{aurecord} can return the actual number of\n\
channels and the rate that is used, which may differ from the ones\n\
selected.\n\
\n\
If the argument 'endpoint' is given, we attempt to wait for audio event\n\
before grabbing samples\n\
@end deftypefn")
{
  /* Octave entry point: record audio and return the samples (one row
   * per sample frame, one column per channel, scaled by 1/32768), the
   * rate used and the channel count used. */
  int nargin = args.length ();
  octave_value_list retval;

  if (nargin < 1 || nargin > 4)
    {
      print_usage ();
      return retval;
    }

  double time = args (0).double_value ();
  int rate = 16000;
  int channels = 1;
  int do_endpoint = 0;
  int i, samples;

  if (nargin > 1)
    rate = args (1).nint_value ();

  if (nargin > 2)
    channels = args (2).nint_value ();

  if (nargin > 3)
    {
      std::string arg = args(3).string_value ();
      if (arg == "endpoint")
        do_endpoint = 1;
    }

  if (error_state)
    return retval;

  /* Prepare for interrupt, remembering the handler that was installed
   * so it can be put back before returning to the interpreter. */
  void (*previous) (int) = signal(SIGINT, cleanup);

  /* open audio device; anything that is not positive is a failure */
  if (audioopen (rate, channels) <= 0)
    {
      if (previous != SIG_ERR)
        signal(SIGINT, previous);
      error ("aurecord: can not open device");
      /* Do not go on to read from a device that is not open. */
      return retval;
    }

  /* skip the first bunch of samples */
  for (i = 0; i < 1024; i++)
    audiosample();

  retval (2) = octave_value (audiochannels);
  retval (1) = octave_value (audiorate);

  samples = (long)((double)audiorate * time)*audiochannels;
  /* A negative duration must not become a negative buffer size. */
  if (samples < 0)
    samples = 0;
  const int bufsize = (samples > 0) ? samples : 1;
  OCTAVE_LOCAL_BUFFER (short, buf, bufsize);

  if (do_endpoint)
    {
      /* wait for audio event before grabbing samples */
      samples = capture(audiorate, buf, samples);
    }
  else
    {
      /* grab all the samples you need directly */
      for (i = 0; i < samples; i++)
        buf[i] = audiosample();
    }

  /* close the audio device */
  audioclose();

  /* Recording is over: give Ctrl-C back to the previous handler so a
   * later interrupt does not run cleanup() and exit the process. */
  if (previous != SIG_ERR)
    signal(SIGINT, previous);

  /* output the captured samples.  The device delivers the channels of
   * each sample frame next to each other, while Matrix stores whole
   * columns contiguously, so element (frame, channel) is addressed
   * explicitly.  Only complete frames are copied, which also keeps the
   * writes inside the matrix when samples is not a multiple of the
   * channel count. */
  const int nchan = (audiochannels > 0) ? audiochannels : 1;
  const int nframes = samples / nchan;
  Matrix buf2 (nframes, nchan);
  for (int frame = 0; frame < nframes; frame++)
    for (int chan = 0; chan < nchan; chan++)
      buf2.xelem (frame, chan)
        = static_cast <double> (buf[frame*nchan + chan]) / 32768.;

  retval(0) = buf2;

  return retval;
}

#endif

Generated by  Doxygen 1.6.0   Back to index