/*
 * Makes English sentences more vocabulary-rich
 *
 * Copyright (c) 2006 Jacobo Tarrio <jtarrio@trasno.net>
 * 
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to permit
 * persons to whom the Software is furnished to do so, subject to the
 * following conditions:
 * 
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
 * NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 */

#define _GNU_SOURCE
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <time.h>
#include <unistd.h>
#include <link-grammar/link-includes.h>
#include <wn.h>

/* Tells whether c is one of the characters that end a sentence */
static int is_sentence_delimiter(char c) {
  return (c == '.') || (c == '!') || (c == '?');
}

/* Copy a single sentence from orig to dest and return its length.

   A sentence is everything up to the first '.', '!' or '?', plus the
   run of delimiters and whitespace that follows it. If no delimiter is
   found the sentence extends to the end of orig.

   dest may be NULL, in which case nothing is written and only the
   length is returned. Otherwise dest must have room for the returned
   length plus one byte for the terminating NUL. Note that dest is
   declared const for historical reasons but IS written through.

   The returned length does not include the terminating NUL. */
size_t copy_sentence(const char *dest, const char *orig) {
  char *destptr = (char *)dest;
  const char *origptr = orig;

  /* Copy up to the first sentence delimiter */
  while (*origptr && !is_sentence_delimiter(*origptr)) {
    if (destptr) {
      *destptr++ = *origptr;
    }
    origptr++;
  }

  /* Copy while there are still sentence delimiters or blanks. The cast
     to unsigned char is required: passing a negative char (any byte
     >= 0x80 where char is signed) to isspace() is undefined. */
  while (*origptr &&
         (is_sentence_delimiter(*origptr) ||
          isspace((unsigned char)*origptr))) {
    if (destptr) {
      *destptr++ = *origptr;
    }
    origptr++;
  }

  /* Add trailing NUL */
  if (destptr) {
    *destptr = 0;
  }

  /* Return length of sentence */
  return (size_t)(origptr - orig);
}

/* Returns a freshly allocated copy of the first sentence found in orig
   (as delimited by copy_sentence). The caller owns the result and must
   free() it. If the allocation fails, copy_sentence is handed a NULL
   destination, writes nothing, and NULL is returned. */
char *extract_sentence(const char *orig) {
  /* First pass only measures; one extra byte holds the terminator */
  size_t needed = copy_sentence(NULL, orig) + 1;
  char *result = malloc(needed);

  /* Second pass performs the actual copy */
  copy_sentence(result, orig);
  return result;
}

/* Tries to imitate orig's capitalization on dest, modifying dest in
   place:
     - orig starts with two uppercase characters: dest is converted to
       all uppercase;
     - only orig's first character is uppercase: dest's first character
       is converted to uppercase;
     - otherwise dest is left alone.
   Nothing is done if orig is shorter than two characters, if dest is
   empty, or if either pointer is NULL. Only orig's first two characters
   are examined, so orig may be a pointer into a longer text. */
void copycapitalization(char *dest, const char *orig) {
  char *ptr;

  if (!dest || !orig)
    return;
  /* Same as strlen(orig) < 2, without scanning the whole string */
  if (!orig[0] || !orig[1])
    return;
  if (!*dest)
    return;

  /* <ctype.h> functions need values in the unsigned char range; a plain
     char may be negative for bytes >= 0x80 */
  if (!isupper((unsigned char)orig[0]))
    return;

  if (isupper((unsigned char)orig[1])) {
    /* Probably all uppercase */
    for (ptr = dest; *ptr; ptr++) {
      *ptr = (char)toupper((unsigned char)*ptr);
    }
  }
  else {
    /* Probably uppercase first character */
    *dest = (char)toupper((unsigned char)*dest);
  }
}

/* Returns a pointer (which must be freed afterwards) to a replacement
   for the given word, looked up in WordNet.

   category selects the part of speech and the search type:
     'n' noun (HYPERPTR), 'v' verb (HYPERPTR),
     'a' adjective (SIMPTR), 'e' adverb (SYNS).

   The replacement is picked at random among the words listed in the
   first three synsets returned by the search. A copy of word itself is
   returned when the category is unknown, when WordNet finds nothing, or
   when those synsets list no words. NULL is returned only if memory
   could not be allocated. */
char *find_synonym(const char *word, char category) {
  SynsetPtr wninfo;
  SynsetPtr sense;
  char *buf;
  int totsyns = 0;
  int senses;
  int i;
  int pos;
  int type;

  switch (category) {
    case 'n':
      pos = NOUN;
      type = HYPERPTR;
      break;
    case 'v':
      pos = VERB;
      type = HYPERPTR;
      break;
    case 'a':
      pos = ADJ;
      type = SIMPTR;
      break;
    case 'e':
      pos = ADV;
      type = SYNS;
      break;
    default:
      /* Unknown category: there is no database to search */
      return strdup(word);
  }

  /* Find synonyms */
  wninfo = findtheinfo_ds((char *)word, pos, type, ALLSENSES);
  if (!wninfo)
    return strdup(word);

  /* Count the candidate words in the first three senses */
  for (sense = wninfo, senses = 0;
       sense && (senses < 3);
       sense = sense->nextss, senses++) {
    totsyns = totsyns + sense->wcount;
  }

  /* Nothing to choose from; also avoids a modulo by zero below */
  if (totsyns <= 0) {
    free_syns(wninfo);
    return strdup(word);
  }

  /* Choose one candidate randomly, then walk the senses to find which
     one it belongs to. i < totsyns guarantees the walk stops within the
     senses counted above. */
  i = random() % totsyns;
  sense = wninfo;
  while (i >= sense->wcount) {
    i = i - sense->wcount;
    sense = sense->nextss;
  }
  buf = strdup(sense->words[i]);

  /* We have to perform some postprocessing on wordnet's result: */
  /* change _ to space and cut the word at the first parenthesis or
     bracket */
  if (buf) {
    for (i = 0; buf[i]; i++) {
      if (buf[i] == '_') {
        buf[i] = ' ';
      }
      else if ((buf[i] == '(') || (buf[i] == '[')) {
        buf[i] = 0;
        break;
      }
    }
  }

  free_syns(wninfo);

  return buf;
}

/* Substitutes one word of a sentence.

   word is a word as reported by link-grammar. Only words ending in
   ".n", ".v", ".a" or ".e" are substituted; that last letter is passed
   to find_synonym as the category. The suffix, and anything from the
   last '[' onwards, is stripped before searching for the word.

   If the stripped word is found in sentence, prints the sentence up to
   that word followed by the replacement (capitalized like the text it
   replaces), and returns a pointer to the remainder of the sentence.
   If not, prints nothing and returns sentence itself. If no replacement
   could be obtained, the original text is printed instead.

   NOTE(review): the word is located with a case-insensitive substring
   search, so the match may fall inside a longer word. */
char *subst_word(char *sentence, const char *word) {
  char *ptr;
  char *realword;
  char *syn;
  char *pos;
  char category;
  size_t n = strlen(word);
  size_t wordlen;

  if (n < 2)
    return sentence;

  category = word[n - 1];

  /* Only words ending in .[nvae] are candidates */
  if ((word[n - 2] != '.') || ((category != 'n') &&
                               (category != 'v') &&
                               (category != 'a') &&
                               (category != 'e')))
    return sentence;

  /* Working copy with the category suffix cut off */
  realword = strdup(word);
  if (!realword)
    return sentence;
  realword[n - 2] = 0;

  /* Remove square brackets */
  if (NULL != (pos = strrchr(realword, '[')))
    *pos = 0;

  /* Nothing left to look for */
  wordlen = strlen(realword);
  if (wordlen == 0) {
    free(realword);
    return sentence;
  }

  /* Find the word */
  ptr = strcasestr(sentence, realword);
  if (!ptr) {
    free(realword);
    return sentence;
  }

  /* Display the sentence up to the word, then the substituted word.
     The prefix is printed straight from sentence with a precision, so
     no temporary copy is needed. */
  syn = find_synonym(realword, category);
  if (syn) {
    copycapitalization(syn, ptr);
    printf("%.*s%s", (int)(ptr - sentence), sentence, syn);
  }
  else {
    /* No replacement available: keep the original text */
    printf("%.*s", (int)((size_t)(ptr - sentence) + wordlen), sentence);
  }

  /* Advance the pointer to after the word */
  ptr = ptr + wordlen;
  free(syn);
  free(realword);

  return ptr;
}

/* Parse a sentence with link-grammar and print it with its words
   substituted.

   Every word of the first linkage is handed to subst_word in order;
   whatever is left of the sentence afterwards is printed as is. The
   sentence is printed unchanged when it contains no alphanumeric
   characters, when no linkage is found, or when a working copy or a
   link-grammar object could not be created. */
void parse_sentence(Parse_Options opts, Dictionary dict, const char *sentence) {
  Sentence sent;
  Linkage linkage;
  const char *word;
  int linkages, words;
  int i;
  int hasalnum = 0;
  char *parsent;
  char *ptr;

  /* Working copy of the sentence, without any linebreaks or so. Determine
     if it doesn't contain any alphanumerics. */
  parsent = strdup(sentence);
  if (!parsent) {
    printf("%s", sentence);
    return;
  }
  for (ptr = parsent; *ptr; ptr++) {
    /* <ctype.h> functions need values in the unsigned char range */
    unsigned char c = (unsigned char)*ptr;

    if (isspace(c))
      *ptr = ' ';
    else if (!hasalnum && isalnum(c))
      hasalnum = 1;
  }

  /* No alphanumerics; print directly */
  if (!hasalnum) {
    printf("%s", sentence);
    free(parsent);
    return;
  }

  /* Parse the sentence with link-grammar */
  sent = sentence_create(parsent, dict);
  if (!sent) {
    printf("%s", sentence);
    free(parsent);
    return;
  }
  linkages = sentence_parse(sent, opts);
  linkage = (linkages > 0) ? linkage_create(0, sent, opts) : NULL;

  if (linkage) {
    words = linkage_get_num_words(linkage);
    /* Substitute each word. The output is built from the original
       sentence, not the working copy, so its line breaks are kept. */
    ptr = (char *)sentence;
    for (i = 0; i < words; i++) {
      word = linkage_get_word(linkage, i);
      if (word)
        ptr = subst_word(ptr, word);
    }
    /* Print the remainder of the sentence */
    printf("%s", ptr);
    linkage_delete(linkage);
  }
  else {
    /* No linkages found: print the sentence directly */
    printf("%s", sentence);
  }
  sentence_delete(sent);
  free(parsent);
}

/* Takes the first sentence of a NUL-terminated buffer, parses it and
   prints the result. Returns the number of characters of buffer that
   were consumed, so the caller can continue right after them.

   If the sentence cannot be extracted (out of memory), its text is
   printed unchanged and its length is still returned, so that the
   caller keeps making progress. */
int parse_buffer(Parse_Options opts, Dictionary dict, const char *buffer) {
  char *sentence;
  size_t len;

  sentence = extract_sentence(buffer);
  if (!sentence) {
    len = copy_sentence(NULL, buffer);
    printf("%.*s", (int)len, buffer);
    return (int)len;
  }

  len = strlen(sentence);
  parse_sentence(opts, dict, sentence);
  free(sentence);

  return (int)len;
}

/* Main loop: reads standard input until it is exhausted, and prints it
   back one sentence at a time with its words substituted.

   A read error is reported on standard error and then handled like the
   end of input, so text that was already read is still processed. */
void run(Parse_Options opts, Dictionary dict) {
  char *buffer;
  char *newptr;
  const int fullbuffer = 32768;
  ssize_t amtread;
  int amtbuffered = 0; /* Remaining to process */
  int len;

  /* One extra byte for the NUL that terminates the buffered text */
  buffer = malloc((size_t)fullbuffer + 1);
  if (!buffer) {
    perror("Could not allocate the input buffer");
    return;
  }
  newptr = buffer;

  /* This is tricky.
   * We want to have a buffer which is processed one sentence at a time.
   * When the buffer is less than half-full we read more input until it
   * is full again. When input is finished, we empty the buffer.
   * We do all this because we want to process sentences, not lines, and
   * a sentence may be split among several lines.
   */
  do {
    /* There's something in the buffer; move it to the beginning and
       read some more */
    if (amtbuffered > 0) {
      memmove(buffer, newptr, (size_t)amtbuffered);
    }

    /* Read until there's nothing more to read or the buffer is full */
    do {
      amtread = read(0, buffer + amtbuffered,
                     (size_t)(fullbuffer - amtbuffered));
      if (amtread > 0)
        amtbuffered = amtbuffered + (int)amtread;
    } while ((amtread > 0) && (amtbuffered < fullbuffer));

    if (amtread < 0) {
      perror("Could not read input");
      amtread = 0;
    }

    /* The sentence functions work on C strings: terminate the valid
       data so they never run into stale bytes or past the buffer */
    buffer[amtbuffered] = 0;

    /* Parse sentences while buffer is more than half-full OR
       input is finished and there's anything left in the buffer */
    newptr = buffer;
    while ((amtbuffered > fullbuffer / 2) ||
           ((amtread == 0) && (amtbuffered > 0))) {
      len = parse_buffer(opts, dict, newptr);
      /* A NUL byte in the input yields an empty sentence; skip that
         byte, otherwise this loop would never advance */
      if (len <= 0)
        len = 1;
      amtbuffered = amtbuffered - len;
      newptr = newptr + len;
    }
  } while (amtread != 0);

  free(buffer);
}

/* Program entry point: initialises WordNet, sets up link-grammar's
   options and English dictionary, seeds the random number generator
   and processes standard input. Exits with status 1 if WordNet or the
   dictionary cannot be set up, 0 otherwise. */
int main(void) {
  Parse_Options options;
  Dictionary dictionary;

  /* A non-zero result from wninit() is treated as failure */
  if (wninit() != 0) {
    perror("Could not initialise WordNet");
    return 1;
  }

  /* Parser settings */
  options = parse_options_create();
  parse_options_set_verbosity(options, 0);
  parse_options_set_screen_width(options, 65536);
  parse_options_set_max_null_count(options, 64);

  /* English dictionary files */
  dictionary = dictionary_create("en/4.0.dict",
                                 "en/4.0.knowledge",
                                 "en/4.0.constituent-knowledge",
                                 "en/4.0.affix");
  if (dictionary == NULL) {
    perror("Could not load link-grammar's dictionary");
    return 1;
  }

  /* Different substitutions on every run */
  srandom(time(NULL));

  run(options, dictionary);

  return 0;
}