/***************************************************************************
 *   Copyright (C) 2008 by elsamuko                                 *
 *   elsamuko@web.de                                                       *
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU General Public License as published by  *
 *   the Free Software Foundation; either version 3 of the License, or     *
 *   (at your option) any later version.                                   *
 *                                                                         *
 *   This program is distributed in the hope that it will be useful,       *
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
 *   GNU General Public License for more details.                          *
 *                                                                         *
 *   You should have received a copy of the GNU General Public License     *
 *   along with this program; if not, write to the                         *
 *   Free Software Foundation, Inc.,                                       *
 *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
 ***************************************************************************/

//small programm to create markovized text
//compile with:
//g++ -O2 -Wall -o markov markov.cpp

#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <istream>
#include <fstream>
#include <sstream>
#include <map>
#include <algorithm>
#include <string>
#include <iterator>
#include <iomanip>
#include <algorithm>
#include <cstdlib>
#include <boost/tokenizer.hpp>
#include <boost/assign/std/vector.hpp> // for 'operator+=()'
#include <boost/assert.hpp>
// #include <boost/multi_index_container.hpp>
// #include <boost/multi_index/sequenced_index.hpp>
#include <boost/multi_index/ordered_index.hpp>
#include <boost/multi_index/identity.hpp>
#include <boost/algorithm/string.hpp>
#include <time.h>

using namespace std;
using namespace boost;

bool DEBUG;


int strToInt( string const& s ) {
    int x;
    std::istringstream in( s );
    in >> x;
    return x;
}


map< string, map<string, double> > stringToMap( string text ) {
    map<string, map<string, double> > wordList;
    typedef tokenizer<char_separator<char> > tokenizer;

    // tokenizer tok( text, char_separator<char>( " \t\n();:!?'\"-" ) );
    tokenizer tok( text, char_separator<char>( " \n\t()-—\"" ) );
    tokenizer::iterator tok_iter_next;

    for ( tokenizer::iterator tok_iter = tok.begin(); next( tok_iter ) != tok.end(); ++tok_iter ) {
        if ( DEBUG ) cout << *tok_iter << endl;
        tok_iter_next = next( tok_iter );
        wordList[*tok_iter][*tok_iter_next]++;
    }

    return wordList;
}


void normalizeMap( map< string, map<string, double> >& wordList ) {
    typedef map< string, map< string, double > >::iterator iter;
    typedef map< string, double>::iterator iter2;

    for ( iter row = wordList.begin(); row != wordList.end(); row++ ) {
        int sum = 0;
        if ( DEBUG ) cout << row->first << endl;
        for ( iter2 col = row->second.begin(); col != row->second.end(); col++ ) {
            sum += col->second;
        }
        for ( iter2 col = row->second.begin(); col != row->second.end(); col++ ) {
            col->second = col->second / sum;
            if ( DEBUG ) cout << "\t" << col->first << " " << col->second << endl;
        }
    }
}


string createText( map< string, map<string, double> > wordList, const string& firstWord, const int numWords ) {
    string text = firstWord;
    double rnd;
    double sum;
    string curWord = firstWord;
    typedef map< string, double>::iterator iter;

    for ( int i = 0; i < numWords; ++i ) {
        rnd = double( rand() ) / RAND_MAX; //random nr between 0 and 1
        if ( DEBUG ) cout << curWord << " " << rnd << endl;
        sum = 0.0;

        int run = 1;
        iter col = wordList[curWord].begin();
        while ( run && ( col != wordList[curWord].end() ) ) {
            if ( DEBUG ) cout << rnd << "\t" << sum << "\t" << col->second << "\t" << col->first << endl;
            if (( rnd >= sum ) && ( rnd < ( sum + col->second ) ) ) {
                curWord = col->first;
                text += " ";
                text += curWord;
                //cout << curWord << endl;
                run = 0;
            }
            sum = sum + col->second;
            col++;
        }
    }
    return text;
}


string fileToString( const string& filename ) {
    ifstream in( filename.c_str() );
    stringstream ret;
    string Line;

    if ( !in ) {
        cerr << "Can't open " << filename << "." << endl;
    } else {
        while ( getline( in, Line ) ) {
            ret << Line << endl;
        }
        if ( DEBUG ) cout << filename << " read." << endl;
        if ( ret.str().size() == 0 ) {
            cout << filename << " is empty." << endl;
        }
    }
    return ret.str();
}


void usage() {
    cout << "Usage: markov [options] textfile" << endl;
    cout << "       -h or --help      This message" << endl;
    cout << "       -d or --debug     Debug output" << endl;
    cout << "       -f or --first     Set the first word" << endl;
    cout << "       -c or --count     Set the number of words" << endl;
    cout << endl;
}


int main( int argc, char *argv[] ) {

    if ( argc > 1 ) {

        // default values
        string numWords = "20";
        string firstWord;
        string fileIn;
        DEBUG = false;

        // set options
        for ( int i = 1; i < argc; i++ ) {
            const char *sw = argv[i];

            if ( !strcmp( sw, "-h" ) || !strcmp( argv[i], "--help" ) ) {
                usage();
                return 0;
            } else if ( !strcmp( sw, "-c" ) || !strcmp( sw, "--count" ) ) {
                if ( i + 1 >= argc ) {
                    usage();
                    return 1;
                }
                numWords = argv[++i];
            } else if ( !strcmp( sw, "-f" ) || !strcmp( sw, "--first" ) ) {
                if ( i + 1 >= argc ) {
                    usage();
                    return 1;
                }
                firstWord = argv[++i];
            } else if ( !strcmp( sw, "-d" ) || !strcmp( sw, "--debug" ) ) {
                DEBUG = true;
            } else {
                fileIn += sw;
            }
        }

        if ( fileIn.size() != 0 ) {

            srand( time( NULL ) ); //initialize rng

            string file = fileToString( fileIn );
            replace_all( file, ".", " . " );
            replace_all( file, "?", " ? " );
            replace_all( file, "!", " ! " );
            //replace_all(file, "/n", " ");
            if ( DEBUG ) cout << file << endl;
            if ( DEBUG ) cout << "------------------------------------------------------" << endl;

            int count = strToInt( numWords );
            map<string, map<string, double> > wordList = stringToMap( file );

            if ( firstWord.size() == 0 ) {
                firstWord = wordList.begin()->first;
            }

            normalizeMap( wordList );
            if ( DEBUG ) cout << "------------------------------------------------------" << endl;
            string text = createText( wordList, firstWord, count );
            if ( DEBUG ) cout << "------------------------------------------------------" << endl;
            replace_all( text, " . ", ". " );
            replace_all( text, " ? ", "? " );
            replace_all( text, " ! ", "! " );
            cout << text << endl;


        } else {
            cout << "Error: No textfile given" << endl;
            usage();
        }

    } else {
        usage();
    }

    return 0;
}

//EOF
