/* htmlparse.cc * * Read an html file and generate a tag tree structure. The copyright for the following is held by Neil Nelson but may be copied or modified by Jure Sah for the purpose of automating or increasing the HTML item extraction ability of the MMS or MesonAI related objectives of which Neil Nelson is a part. * */ #include #include #include #include #include #include #include #include #include #include #include #include "htmlparse.h" // #define DEBUG_PARSI // #define DEBUG_TREE #define JURES_VERSION // extern ifstream ihtml; // extern char *res; // debug #ifdef JURES_VERSION int ihtml_eof; int shut_down_signal=0; void shut_down_handler(int dummy); #else extern int ihtml_eof; extern int shut_down_signal; extern void shut_down_handler(int dummy); #endif extern string get_html_char(); string get_lower(string cap_word, uint length); void delete_html_tag_tree(html_tag* wrk_tag); html_tag* get_last_html_tag(html_tag* wrk_tag); int get_html_file_list(string directory_name); void out_of_memory(); // html_tag *get_html_tags() Html_Tag_Classes get_html_tags(Html_Tag_Classes html_tag_classes) { signal(30, shut_down_handler); html_tag *cur_tag, *top_tag, *new_tag, *wrk_tag, *prev_tag, *bot_tag, *temp_tag; Anchor_Tag *top_anchor_tag, *cur_anchor_tag, *new_anchor_tag; top_anchor_tag=0; string schar, prev_schar="", sstr="", wrk_str; string::size_type wrk_str_len; int next_dir=0, end_of_string, tags_created=0, tags_deleted=0; // 1 = down, 2 = right. 
int anchor_tags_created=0; sstr = ""; schar = get_html_char(); while(schar != "<") { sstr += schar; schar = get_html_char(); } top_tag = cur_tag = new html_tag; tags_created+=1; #ifdef DEBUG_PARSI cout << "new html_tag top_tag= " << top_tag << "\n"; #endif cur_tag->tagvalue = sstr; next_dir=2; // right while(!ihtml_eof) { if (shut_down_signal) { html_tag_classes.top_tree_tag = 0; return html_tag_classes; } #ifdef DEBUG_PARSI cout << "top of loop schar= " << schar << "\n"; #endif // ***************** if (schar == "<") // start tag { #ifdef DEBUG_PARSI cout << "In schar == < " << "\n"; #endif sstr = ""; while(!ihtml_eof && schar != " " && schar != ">" && sstr != "") { sstr += get_html_char(); wrk_str_len = sstr.size(); if (wrk_str_len > 2) wrk_str = sstr.substr(wrk_str_len-3,3); } if (ihtml_eof) { cout << "

error - eof found, expected --> for end of comment.

\n"; cout.flush(); html_tag_classes.top_tree_tag = 0; return html_tag_classes; } sstr = sstr.substr(0,wrk_str_len-3); cur_tag->tagvalue = sstr; cur_tag->tagend = "-->"; sstr = ""; // remove? next_dir = 2; // right // continue at bottom of main loop #ifdef DEBUG_PARSI cout << "In sstr ==