/* jureparse.cc A file open and memory load example. The copyright for the following is held by Neil Nelson but may be copied or modified by Jure Sah for the purpose of automating or increasing the HTML item extraction ability of the MMS or MesonAI related objectives of which Neil Nelson is a part. Inventory: jureparse.cc This file htmlparse.cc htmlparse.h Google.html Google.ankr Google.tree The objective is to take the input Google.html and obtain the output: Google.ankr, and Google.tree. Compile and link lines from the Makefile using gnu C++. CFLAGS= -g -O3 -Wall -pedantic NFLAGS= -fPIC -c gtfilchr.so: gtfilchr.cc g++ $(NFLAGS) gtfilchr.cc -o gtfilchr.so gthtmchr.so: gthtmchr.cc g++ $(NFLAGS) gthtmchr.cc -o gthtmchr.so htmlparse.so: htmlparse.cc htmlparse.h g++ $(NFLAGS) htmlparse.cc -o htmlparse.so jureparse: jureparse.cc gthtmchr.so htmlparse.so g++ jureparse.cc gthtmchr.so htmlparse.so -o jureparse */ // *** Clearly not all the following are needed for your work, // *** but I just copied them from my working program without trying to figure out // *** which ones could be removed. You can try commenting each one in turn and // *** then seeing if the code compiles without error. #include #include #include #include #include #include #include #include // #include #include #include #include #include "htmlparse.h" char *res; extern int ihtml_eof; char *get_res_test(string file_name) // Load file into res, a memory file or char stream. { ifstream ihtml; // int ihtml_eof; // static int i=0; // debug string schar=""; char raw_char, *chr_out; int end_null=0; // *** Put in your own directory location for the raw html file. string ifile_name = "/home/n_nelson/web_search/nnget/prime_results/" + file_name + ".html" + char(end_null); #ifdef DEBUG_SEARCH // cout << "*** ihtml.open(ifile_name= " << ifile_name << "\n"; #endif ihtml.open(ifile_name.c_str()); if (!ihtml) { cout << "could not open ifile_name= " << ifile_name << "\n"; exit(1); } // int i=0; // debug ihtml.get(raw_char); while(!ihtml.eof()) { schar += raw_char; ihtml.get(raw_char); // i+=1; // if (i < 50) // cout << "get_char raw_char= " << raw_char << " i= " << i << "\n"; // if (i > 600) exit(1); } ihtml.close(); schar+=char(end_null); // ihtml_eof = 0; int res_len = strlen(schar.c_str())+1; chr_out = new char [res_len]; // Be sure to use `delete [] chr_out;' later to clear memory. #ifdef DEBUG_SEARCH // cout << "*** ihtml.open(ifile_name= " << ifile_name << " res_len= " << res_len << "\n"; #endif chr_out = strcpy(chr_out,schar.c_str()); return chr_out; } int main() { string file_name="Google"; res = get_res_test(file_name); // Load file into res int end_null=0; // *** put in your own output location directories here. string tree_file_name = "/home/n_nelson/web_search/nnget/prime_result_trees/" + file_name + ".tree" + char(end_null); string anchor_file_name = "/home/n_nelson/web_search/nnget/prime_result_trees/" + file_name + ".ankr" + char(end_null); Html_Tag_Classes html_tag_classes; cout << "before get_html_tags
\n"; cout.flush(); html_tag_classes = get_html_tags(html_tag_classes); cout << "write_anchor_tags html_tag_classes.top_anchor_tag= " << html_tag_classes.top_anchor_tag << "
\n"; cout.flush(); write_anchor_tags(anchor_file_name, html_tag_classes.top_anchor_tag); cout << "write_html_tree html_tag_classes.top_tree_tag= " << html_tag_classes.top_tree_tag << "
\n"; cout.flush(); write_html_tree(tree_file_name, html_tag_classes.top_tree_tag); delete_html_tag_tree(html_tag_classes.top_tree_tag); delete_anchor_tags(html_tag_classes.top_anchor_tag); delete [] res; return 0; } // *** You can modify the following routine to be part of a single program file or // *** make another program file as I have done. /* gthtmchr.cc Reads char* instead of a disk file. The above code is set up for this routine instead of gtfilchr.cc below. */ #include extern char *res; int ihtml_eof; string get_html_char() { // static int i=0; // debug string schar=""; if (*res == 0) { ihtml_eof=1; return schar = ""; } if (*res < 32) schar = " "; else if (*res > 126) schar = " "; else schar = *res; // i+=1; // if (i < 50) // cout << "get_char schar= " << schar << " i= " << i << "\n"; // if (i > 50) exit(1); res++; return schar; } /* gtfilchr.cc Reads a disk file instead of a char*. */ /* Commented to avoid subroutine name conflict with prior routine. #include #include #include ifstream ihtml; int ihtml_eof=0; string get_html_char() { static int i=0; // debug string schar=""; char raw_char; ihtml.get(raw_char); if (ihtml.eof()) { ihtml_eof=1; return schar = ""; } if (int(raw_char) < 32) schar = " "; else if (int(raw_char) > 126) schar = " "; else schar = raw_char; // i+=1; // if (i < 50) // cout << "get_char schar= " << schar << " raw_char= " << raw_char << " i= " << i << "\n"; // if (i > 50) exit(1); return schar; } */