/*

Copyright (c) 2007 Carl Byington - 510 Software Group, released under
the GPL version 3 or any later version at your choice available at
http://www.gnu.org/licenses/gpl-3.0.txt

*/

#ifndef scanner_include
#define scanner_include

#include "dnsbl.h"

////////////////////////////////////////////////
// memory for the content scanner
//
class recorder
{
    // Data members. Their declaration order is deliberately left untouched,
    // since C++ initializes members in declaration order.
    mlfiPriv    *priv;          // needed for syslog
    string_set  *html_tags;     // valid tags
    string_set  *tlds;          // valid tlds
    string_set  *tldwilds;      // valid wildcard tlds
    string_set  *tldnots;       // invalid tlds
    string_set  hosts;          // set returned by get_hosts() and sized by excessive_hosts()
    size_t      bad_html_tags;  // counter tested by excessive_bad_tags()
    size_t      binary_tags;    // counter that discounts bad_html_tags in excessive_bad_tags()

public:
    // The sets are passed by reference while the members above are pointers.
    // NOTE(review): presumably the addresses of the arguments are stored, so
    // they would have to outlive this object - confirm in the definition.
    recorder(mlfiPriv *priv_, string_set &html_tags_, string_set &tlds_, string_set &tldwilds_, string_set &tldnots_);

    // Destruction simply runs empty().
    ~recorder()
    {
        empty();
    }

    // Defined outside this header.
    void empty();
    void new_url(const char *host);
    void new_tag(const char *tag);
    void binary();

    // Forward a message to my_syslog() together with the stored priv pointer.
    void syslog(const char *buf)
    {
        my_syslog(priv, buf);
    }

    // Plain accessors for the stored pointers.
    mlfiPriv *get_priv()
    {
        return priv;
    }

    string_set *get_tlds()
    {
        return tlds;
    }

    string_set *get_tldwilds()
    {
        return tldwilds;
    }

    string_set *get_tldnots()
    {
        return tldnots;
    }

    // Returns the host set by reference, so callers can modify it.
    string_set &get_hosts()
    {
        return hosts;
    }

    // True only when all three conditions hold:
    //   - limit is nonzero (a limit of 0 disables the check),
    //   - bad_html_tags is strictly greater than limit,
    //   - bad_html_tags is strictly greater than three times binary_tags.
    bool excessive_bad_tags(size_t limit)
    {
        if (limit == 0) {
            return false;
        }
        if (bad_html_tags <= limit) {
            return false;
        }
        return bad_html_tags > 3 * binary_tags;
    }

    // True when limit is nonzero and the host set holds more than limit
    // entries; a limit of 0 disables the check.
    bool excessive_hosts(size_t limit)
    {
        if (limit == 0) {
            return false;
        }
        return hosts.size() > limit;
    }
};


////////////////////////////////////////////////
// the content scanner
//
// fsa is only used through pointers here, so a forward declaration suffices.
class fsa;

class url_scanner
{
    // One fsa instance per parser, named for its role. The fsa type and
    // the way these are created are not visible in this header.
    // NOTE(review): this class has a user-declared destructor and raw pointer
    // members but no copy constructor / copy assignment. If the destructor
    // deletes these pointers, copying a url_scanner would double-delete -
    // confirm ownership in the implementation file.
    fsa *host_parser;
    fsa *tags_parser;
    fsa *urls_parser;
    fsa *urld_parser;
    fsa *html_parser;
    fsa *mime_parser;
    fsa *b64_parser;
    fsa *uu_parser;

public:
    // Build a scanner around the given recorder (defined outside this header).
    url_scanner(recorder *memory);

    ~url_scanner();

    // Scan `length` bytes starting at `buffer` (defined outside this header).
    void scan(u_char *buffer, size_t length);
};

#endif
