// FILE: html.h
//
// Utilities for the CS23 search engine project (or for general use).
//
// HTML parser utility interface: declarations and usage notes for the
// parser helpers. The definitions are not in this header.

// NOTE(review): identifiers starting with an underscore followed by an
// uppercase letter are reserved for the implementation in C. The guard name
// is kept as-is so that any code testing it keeps working; consider renaming.
#ifndef _HTML__H_
#define _HTML__H_

//! \brief HTML parser that retrieves the next URL from an HTML page.
//!
//! \param html the HTML data to be parsed.
//! \param urlofthispage the absolute URL of the page passed in by \a html.
//!        Many websites use relative paths in their pages, and the output
//!        must be an absolute URL, so the URL of the current page is required.
//! \param result buffer the resulting URL is written to. The caller must
//!        allocate it and set it to zero before calling this function.
//! \param pos position in the HTML at which parsing starts.
//! \return 1 + the position of the newly found URL in the HTML, or -1 if the
//!         end of the document is reached.
//!
//! \warning Sometimes an empty \a result is returned, which means the URL
//!          path could not be understood, e.g. "../../a.html".
//!
//! Usage example (retrieve all URLs in a page):
//! \code
//!   int pos = 0;
//!   char result[1000];
//!   BZERO(result, 1000);
//!   while ((pos = GetNextURL(html, urlofthispage, result, pos)) > 0) {
//!     /* DO SOMETHING WITH THE RESULT URL */
//!     BZERO(result, 1000);
//!   }
//! \endcode
//! This retrieves every URL in the document, one URL per loop iteration.
//!
//! \warning Make sure \a result has been BZEROed before every call.
//! \warning Make sure \a result is big enough to hold any URL.
int GetNextURL(char* html, char* urlofthispage, char* result, int pos);

//! \brief Convert all letters in \a word to lower case.
//!
//! \param word the word to be normalized.
void NormalizeWord(char* word);

//! \brief Normalize a URL.
//!
//! \param URL the URL to be normalized.
//! \return 1 if this URL is of a pure text format (html/php/jsp),
//!         0 if it is of another type (pdf/jpg/...).
int NormalizeURL(char* URL);

//! \brief Remove all white space characters from a large string.
//!
//! \param html the string to remove white space from.
void removeWhiteSpace(char* html);

//! \brief Non-zero if \a c lies in 'a'..'z' or 'A'..'Z', zero otherwise.
//!
//! \warning \a c is evaluated more than once; do not pass an expression
//!          with side effects (e.g. IS_ALPHA(*p++)).
#define IS_ALPHA(c) ((('a'<=(c))&&((c)<='z'))||(('A'<=(c))&&((c)<='Z')))

#endif