This is a newer version (especially the source code) of the former blog entry at spin.
How does it work?
- use libxml to parse html
- scan css files for image urls
- sorry, but no javascript :(
- rewrite relative and absolute URLs so that they resolve correctly
- download all these files
The source code:
#include #include #include #include #include <libxml/parser.h> #include <libxml/HTMLparser.h> #include <libxml/xmlerror.h> #include <curl/curl.h> #include #include "getpage.h" #define FILELENGTH 150 #define CURL_TIMEOUT_SEC 240 #define SELECT_TIMEOUT_SEC 10 #define MAX_P_FILE_DOWNLOADS 10 #define DEBUG static char ALPHABET[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890"; /* Get out: * TODO: frames * TODO: javascript file riddles ;) * TODO: wrong content-length -> reload * TODO: same file in 2 different CSS files */ enum FILETYPE { IMG = 0x1, STYLE = 0x2, SCRIPT = 0x4, IFRAME = 0x8, FRAME = 0x10, PDF = 0x20, CSS_IMG = 0x40, NONE = 0x80 }; struct _replace_info { char *begin; char end; void (*userfunction) (void*, char*, int, bool); void *userdata; char *buffer; int begin_progress; int begin_length; bool inside_gap; int status; }; struct _site_files { char *url; char *url2; char *filename; enum FILETYPE ft; struct _site_files *next; FILE *fp; struct _replace_info *ri; short nth_url; #ifdef DEBUG int id; bool done; #endif }; struct _site_userdata { //void (*site_function)(void*, const char*, ...); void (*site_function)(void*, const char*, va_list); void *userdata; struct _site_files *sf; char *_base_url; bool _utf8_meta_set; CURL *_mhnd; CURL *_hnd; }; struct _css_filter_userdata { struct _site_userdata *su; char *url; }; struct _css_filter_save_userdata { struct _site_userdata *su; FILE *fp; char *filename; char *url; char *_css_base_url; }; static char *_filetype_string(enum FILETYPE ft) { char *txt; switch(ft) { case IMG: txt = "IMG"; break; case CSS_IMG: txt = "CSS_IMG"; break; case STYLE: txt = "STYLE"; break; case SCRIPT: txt = "SCRIPT"; break; case IFRAME: txt = "IFRAME"; break; case FRAME: txt = "FRAME"; break; case PDF: txt = "PDF"; break; case NONE: txt = "OTHER"; break; default: txt = "DEFAULT"; break; } return txt; } static void _user_function(struct _site_userdata *su, const char *fmt, ...) 
{ va_list ap; va_start(ap, fmt); su->site_function(su->userdata, fmt, ap); va_end(ap); } static char *__join_together(char *a, char *b, int len_b) { int len_a = 0; int i; char *new; if (a != NULL) len_a += strlen(a); new = realloc(a, len_b+1+len_a); if (new != NULL) { for (i = 0; i < len_b; i++) new[i+len_a] = b[i]; new[len_a+len_b] = '\0'; } return new; } // return true if inside gap -> 1 // return false if outside gap -> -1 static int inline replace_step(struct _replace_info *ri, char txt) { if (txt == ri->begin[ri->begin_progress]) ri->begin_progress++; else ri->begin_progress = 0; if (ri->begin_progress == ri->begin_length) { ri->begin_progress = 0; ri->inside_gap = true; return -1; } if (ri->inside_gap) { if (txt == ri->end) { ri->inside_gap = false; return -1; } else return 1; } return -1; } static void replace(struct _replace_info *ri, char *txt, int length) { int i; int offset = 0; int status_temp = -1; for (i = 0; i < length; i++) { status_temp = replace_step(ri, txt[i]); if (ri->status != status_temp) { if (ri->buffer != NULL) { if (ri->status == 1) { ri->userfunction(ri->userdata, ri->buffer, strlen(ri->buffer), true); } else if (ri->status == -1) { ri->userfunction(ri->userdata, ri->buffer, strlen(ri->buffer), false); } free(ri->buffer); ri->buffer = NULL; } if (ri->status == 1) { ri->userfunction(ri->userdata, txt+offset, i-offset, true); } else if (ri->status == -1) { ri->userfunction(ri->userdata, txt+offset, i-offset, false); } offset = i; } ri->status = status_temp; } if (offset != length) { if (status_temp == 1 || status_temp == -1) { ri->userfunction(ri->userdata, txt+offset, i-offset, ri->status == 1 ? 
true : false); } else { if (txt[length-1] == '\0') { if (ri->buffer != NULL) { ri->userfunction(ri->userdata, ri->buffer, strlen(ri->buffer), false); } free(ri->buffer); ri->buffer = NULL; ri->userfunction(ri->userdata, txt+offset, length-offset, false); } else ri->buffer = __join_together(ri->buffer, txt+offset, length-offset); } } } static void _set_chnd(CURL *hnd, char *url, void *cbfunction, void *userdata) { curl_easy_setopt(hnd, CURLOPT_INFILESIZE_LARGE, (curl_off_t)-1); curl_easy_setopt(hnd, CURLOPT_URL, url); curl_easy_setopt(hnd, CURLOPT_NOPROGRESS, 1); curl_easy_setopt(hnd, CURLOPT_FAILONERROR, 0); curl_easy_setopt(hnd, CURLOPT_USERAGENT, "libmessage - btwotch+libmessage@gmail.com"); //curl_easy_setopt(hnd, CURLOPT_USERAGENT, "Mozilla/5.0 (X11; U; Linux x86_64; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.62 Safari/534.3"); curl_easy_setopt(hnd, CURLOPT_RESUME_FROM_LARGE, (curl_off_t)0); curl_easy_setopt(hnd, CURLOPT_MAXREDIRS, 50); curl_easy_setopt(hnd, CURLOPT_SSLVERSION, 0); curl_easy_setopt(hnd, CURLOPT_TIMECONDITION, 0); curl_easy_setopt(hnd, CURLOPT_TIMEVALUE, 0); curl_easy_setopt(hnd, CURLOPT_CUSTOMREQUEST, NULL); curl_easy_setopt(hnd, CURLOPT_CONNECTTIMEOUT, CURL_TIMEOUT_SEC); curl_easy_setopt(hnd, CURLOPT_TIMEOUT, CURL_TIMEOUT_SEC); curl_easy_setopt(hnd, CURLOPT_HTTPAUTH, 1); curl_easy_setopt(hnd, CURLOPT_ENCODING, NULL); curl_easy_setopt(hnd, CURLOPT_IPRESOLVE, 0); curl_easy_setopt(hnd, CURLOPT_IGNORE_CONTENT_LENGTH, 0); curl_easy_setopt(hnd, CURLOPT_POSTREDIR, 0); curl_easy_setopt(hnd, CURLOPT_WRITEFUNCTION, cbfunction); curl_easy_setopt(hnd, CURLOPT_WRITEDATA, userdata); curl_easy_setopt(hnd, CURLOPT_FOLLOWLOCATION, 1); curl_easy_setopt(hnd, CURLOPT_NOSIGNAL, 1); curl_easy_setopt(hnd, CURLOPT_AUTOREFERER, 1); curl_easy_setopt(hnd, CURLOPT_ENCODING, "deflate"); curl_easy_setopt(hnd, CURLOPT_SSL_VERIFYHOST, 1); // TODO } static void _filename_gen(struct _site_files *first_sf, char *filename) { int i; bool name_double; struct 
_site_files *sf; do { name_double = false; srand(1337^filename[0]); for (i = FILELENGTH/2; i < FILELENGTH; i++) filename[i] = ALPHABET[rand()% (strlen(ALPHABET)-1)]; filename[FILELENGTH-1] = '\0'; sf = first_sf; while (sf != NULL && sf->filename != NULL) { if (!strcasecmp(sf->filename, filename)) name_double = true; sf = sf->next; } } while (name_double); } static char* _shrink_url(char *rurl) // remove apostrophes etc. { int length; while (rurl[0] != '\0' && rurl[0] == ' ') rurl++; length = strlen(rurl); for (int i = 0; i < length/2; i++) if (rurl[i] == '\'' || rurl[i] == '\"') { if (rurl[i] == rurl[length-i-1]) { rurl[length-i-1] = '\0'; rurl++; } } else break; return rurl; } static void _crap_sites_aburl(char **abs_url, CURL *hnd, char *rurl, char *static_burl) { int abs_urllen; if (!strncasecmp(rurl, "//", 2)) // gmx-hack { abs_urllen = 5+strlen(rurl)+1; *abs_url = malloc(abs_urllen*sizeof(char)); snprintf(*abs_url, abs_urllen, "http:%s", rurl); } } static void _relative_aburl(char **abs_url, CURL *hnd, char *rurl, char *static_burl, short nth_url) { int abs_urllen; int domain_end = 0, i; int base_len = 0; char *burl = NULL; if (strncasecmp(rurl, "http://", 7) && strncasecmp(rurl, "https://", 8) && strncasecmp(rurl, "ftp://", 6) && strncasecmp(rurl, "file://", 7) && strncasecmp(rurl, "about:", 6) && strncasecmp(rurl, "javascript:", 11)) { if (static_burl != NULL) burl = static_burl; else if (curl_easy_getinfo(hnd, CURLINFO_EFFECTIVE_URL, &burl) != CURLE_OK) { fprintf(stderr, "CURLINFO_EFFECTIVE_URL failed\n"); exit(1); } if (!strncasecmp(burl, "http://", 7)) domain_end = 7; else if (!strncasecmp(burl, "https://", 8)) domain_end = 8; else if (!strncasecmp(burl, "ftp://", 6)) domain_end = 6; else if (!strncasecmp(burl, "file://", 6)) domain_end = strlen(burl); if (nth_url > 0) for (i = domain_end+1; i < strlen(burl); i++) { if (burl[i] == '/') { if (i < strlen(burl)-1) if (burl[i+1] == '/') continue; if (nth_url == 1) { base_len = i; break; } else nth_url--; } } 
if (nth_url == -1) for (i = strlen(burl); i > domain_end; i--) if (burl[i] == '/') { base_len = i; break; } if (base_len == 0) base_len = strlen(burl); abs_urllen = strlen(rurl) + strlen(burl) + 2; *abs_url = malloc(sizeof(char)*abs_urllen); snprintf(*abs_url, abs_urllen, "%.*s/%s", base_len, burl, rurl); } } static char* _absolute_url(CURL *hnd, char *rurl, char *static_burl, short nth_url) { char *abs_url = NULL; if (nth_url == 1) { _crap_sites_aburl(&abs_url, hnd, rurl, static_burl); if (abs_url != NULL) return abs_url; } _relative_aburl(&abs_url, hnd, rurl, static_burl, nth_url); if (abs_url != NULL) return abs_url; if (nth_url == 1) { int abs_urllen = strlen(rurl)+1; abs_url = malloc(abs_urllen+1); strncpy(abs_url, rurl, abs_urllen); //abs_url = strdup(rurl); } return abs_url; } static char *_site_files_add(struct _site_userdata *su, char *url, char *base_url, enum FILETYPE ft) { struct _site_files *sf = su->sf; char *newurl, *newfilename, *sec_url; int i; int url_length; int filename_length; if (url == NULL) return NULL; url_length = strlen(url)+1; //printf("%s %s\n", su->_base_url, base_url); url = _shrink_url(url); newurl = _absolute_url(su->_hnd, url, (base_url != NULL) ? base_url : su->_base_url, 1); sec_url = _absolute_url(su->_hnd, url, (base_url != NULL) ? 
base_url : su->_base_url, -1); if (sf != NULL) { if (!strcmp(sf->url, newurl)) { free(newurl); free(sec_url); return sf->filename; } while (sf->next != NULL) { sf = sf->next; if (!strcmp(sf->url, newurl)) { free(newurl); free(sec_url); return sf->filename; } } sf->next = malloc(sizeof(struct _site_files)); sf = sf->next; sf->ri = NULL; sf->next = NULL; } else { sf = malloc(sizeof(struct _site_files)); sf->ri = NULL; su->sf = sf; sf->next = NULL; } sf->filename = NULL; sf->ft = ft; sf->url = newurl; sf->url2 = sec_url; filename_length = strlen(newurl)+1; if (filename_length > FILELENGTH) filename_length = FILELENGTH; newfilename = malloc(sizeof(char)*(filename_length)); strncpy(newfilename, sf->url, filename_length); if (filename_length == FILELENGTH) _filename_gen(sf, newfilename); sf->filename = newfilename; for (i = 0; i < strlen(sf->filename); i++) { if (sf->filename[i] == '/') sf->filename[i] = '_'; else if (sf->filename[i] == '?') sf->filename[i] = '_'; else if (sf->filename[i] == '#') sf->filename[i] = '_'; else if (sf->filename[i] == '@') sf->filename[i] = '_'; else if (sf->filename[i] == '%') sf->filename[i] = '_'; else if (sf->filename[i] == ':') sf->filename[i] = '_'; else if (sf->filename[i] == ' ') sf->filename[i] = '_'; } return sf->filename; } void _save_file_css_save(void *userdata, char *gap, int length, bool gapped) { struct _css_filter_save_userdata *cfsu = (struct _css_filter_save_userdata*) userdata; char *filename; if (cfsu->fp == NULL) { // first call of this func. 
cfsu->fp = fopen(cfsu->filename, "w"); cfsu->url = NULL; } if (gapped) cfsu->url = __join_together(cfsu->url, gap, length); else if (!gapped && cfsu->url != NULL) { filename = _site_files_add(cfsu->su, cfsu->url, cfsu->_css_base_url, CSS_IMG); fprintf(cfsu->fp, "%s", filename); free(cfsu->url); cfsu->url = NULL; fprintf(cfsu->fp, "%.*s", length, gap); } else fprintf(cfsu->fp, "%.*s", length, gap); } size_t _save_file_css(char *txt, size_t size, size_t nmemb, struct _site_files *sf) // feed the replacer! { if (size == 0 && nmemb == 0 && sf->fp != NULL) { fclose(sf->fp); } else if (sf->fp == NULL) sf->fp=fopen(sf->filename, "w"); if (sf->fp == NULL) { perror("fopen"); return 0; } replace(sf->ri, txt, size*nmemb); return size*nmemb; } size_t _save_file(char *txt, size_t size, size_t nmemb, struct _site_files *sf) { int i; if (size == 0 && nmemb == 0 && sf->fp != NULL) { fclose(sf->fp); } else if (sf->fp == NULL) sf->fp=fopen(sf->filename, "w"); if (sf->fp == NULL) { perror("fopen"); return 0; } for (i = 0; i < size*nmemb; i++) fputc(txt[i], sf->fp); return size*nmemb; } static void _set_css_ri(struct _replace_info *ri, void *userdata, void *userfunction) { ri->begin = "url("; ri->end = ')'; ri->userfunction = userfunction; ri->userdata = userdata; ri->buffer = NULL; ri->begin_progress = 0; ri->begin_length = 4; ri->inside_gap = false; ri->status = -1; } static int _add_download_files(struct _site_files *sf, struct _site_userdata *su, CURL *mhnd, short nth_url) { struct _css_filter_save_userdata *cfsu; CURL *hnd; sf->fp = NULL; sf->nth_url = nth_url; #ifdef DEBUG static int id; sf->id = id++; fprintf(stderr, "Download (id: %d, %s) %s -> %s\n", id, _filetype_string(sf->ft), sf->url, sf->filename); #endif if (sf->ft == STYLE) { sf->ri = malloc(sizeof(struct _replace_info)); cfsu = malloc(sizeof(struct _css_filter_save_userdata)); cfsu->_css_base_url = sf->url; cfsu->fp = NULL; cfsu->filename = sf->filename; cfsu->su = su; _set_css_ri(sf->ri, cfsu, _save_file_css_save); 
if (nth_url == 1) { hnd = curl_easy_init(); _set_chnd(hnd, sf->url, _save_file_css, sf); } else if (nth_url == 2) { if (sf->url2 != NULL) { hnd = curl_easy_init(); _set_chnd(hnd, sf->url2, _save_file_css, sf); } else return -1; } else return -1; } else { if (nth_url == 1) { hnd = curl_easy_init(); _set_chnd(hnd, sf->url, _save_file, sf); } else if (nth_url == 2) { if (sf->url2 != NULL) { hnd = curl_easy_init(); _set_chnd(hnd, sf->url2, _save_file, sf); } else return -1; } else return -1; } curl_easy_setopt(hnd, CURLOPT_PRIVATE, sf); curl_multi_add_handle(mhnd, hnd); #ifdef DEBUG sf->done = false; #endif return 1; } static void _download_files(struct _site_userdata *su) { int handles = 1, msgs_in_queue, maxfd; int iteration = 0; int downloads = 0; // current downloads char *curlinfo_private; CURL *mhnd; CURLMsg *cmsg; struct _site_files *first_sf = su->sf; struct _site_files *sf = first_sf; struct _site_files *tmp_sf; struct timeval timeout; fd_set fdread, fdwrite, fderr; char *burl; #ifdef DEBUG char *ip; #endif long response_code; if (sf == NULL) return; mhnd = curl_multi_init(); if (_add_download_files(sf, su, mhnd, 1) > 0) downloads++; sf = sf->next; while (CURLM_CALL_MULTI_PERFORM == curl_multi_perform(mhnd, &handles) && handles != 0); do { iteration++; FD_ZERO(&fdread); FD_ZERO(&fdwrite); FD_ZERO(&fderr); timeout.tv_sec = SELECT_TIMEOUT_SEC; timeout.tv_usec = 0; curl_multi_fdset(mhnd, &fdread, &fdwrite, &fderr, &maxfd); switch(select(maxfd+1, &fdread, &fdwrite, &fderr, &timeout)) { case -1: #ifdef DEBUG fprintf(stderr, "select bad :(\n"); perror("!!! 
select failed "); while ((cmsg = curl_multi_info_read(mhnd, &msgs_in_queue)) != NULL) { if (cmsg->data.result != 0) { curl_easy_getinfo(cmsg->easy_handle, CURLINFO_PRIMARY_IP, &ip); fprintf(stderr, "ip: %s url: %s result: %d", ip, burl, cmsg->data.result); if (cmsg->data.result == 7) fprintf(stderr, " (couldn't connect)"); fprintf(stderr, "\n"); } } fprintf(stderr, "-----------\n"); #endif default: while ((cmsg = curl_multi_info_read(mhnd, &msgs_in_queue)) != NULL) if (cmsg->msg == CURLMSG_DONE) { curl_easy_getinfo(cmsg->easy_handle, CURLINFO_PRIVATE, &curlinfo_private); tmp_sf = (struct _site_files*)curlinfo_private; if (tmp_sf->ft == CSS_IMG) _save_file_css(NULL, 0, 0, tmp_sf); else _save_file(NULL, 0, 0, tmp_sf); downloads--; #ifdef DEBUG tmp_sf->done = true; #endif curl_easy_getinfo(cmsg->easy_handle, CURLINFO_EFFECTIVE_URL, &burl); curl_easy_getinfo(cmsg->easy_handle, CURLINFO_RESPONSE_CODE, &response_code); if (response_code >= 400) { if (tmp_sf->nth_url == 2) { fprintf(stderr, "Failed (%ld): %s -> %s (%s) ", response_code, burl, tmp_sf->filename, _filetype_string(tmp_sf->ft)); fprintf(stderr, "second url: %s\n", tmp_sf->url2); } else { if (_add_download_files(tmp_sf, su, mhnd, 2) > 0) downloads++; } } curl_easy_cleanup(cmsg->easy_handle); } do { // download 1st file if (iteration == 1 && sf != NULL && downloads < MAX_P_FILE_DOWNLOADS) { if (_add_download_files(sf, su, mhnd, 1) > 0) downloads++; } while (sf != NULL && sf->next != NULL && downloads < MAX_P_FILE_DOWNLOADS) // sf->next must not be NULL as we are adding to the list ;) { if (_add_download_files(sf->next, su, mhnd, 1) > 0) downloads++; sf = sf->next; } } while (CURLM_CALL_MULTI_PERFORM == curl_multi_perform(mhnd, &handles) && handles != 0); break; } } while(handles != 0); while ((cmsg = curl_multi_info_read(mhnd, &msgs_in_queue)) != NULL) { curl_easy_getinfo(cmsg->easy_handle, CURLINFO_PRIVATE, &curlinfo_private); tmp_sf = (struct _site_files*)curlinfo_private; if (tmp_sf->ft == CSS_IMG) 
_save_file_css(NULL, 0, 0, tmp_sf); else _save_file(NULL, 0, 0, tmp_sf); downloads--; curl_easy_getinfo(cmsg->easy_handle, CURLINFO_EFFECTIVE_URL, &burl); curl_easy_getinfo(cmsg->easy_handle, CURLINFO_RESPONSE_CODE, &response_code); if (response_code >= 400) { if (tmp_sf->nth_url == 2) { fprintf(stderr, "Failed (%ld): %s -> %s (%s) ", response_code, burl, tmp_sf->filename, _filetype_string(tmp_sf->ft)); fprintf(stderr, "second url: %s\n", tmp_sf->url2); } else { if (_add_download_files(tmp_sf, su, mhnd, 2) > 0) downloads++; } } curl_easy_cleanup(cmsg->easy_handle); #ifdef DEBUG tmp_sf->done = true; #endif } sf = first_sf; while (sf != NULL) { #ifdef DEBUG printf("id: %d url: %s url2: %s done: %d\n", sf->id, sf->url, sf->url2, sf->done); #endif free(sf->url); free(sf->url2); free(sf->filename); if (sf->ri != NULL) { free(sf->ri->userdata); free(sf->ri); } tmp_sf = sf; sf = sf->next; free(tmp_sf); } #ifdef DEBUG if (downloads != 0) { printf("!!downloads: %d (%s)\n", downloads, su->_base_url); exit(-1); } #endif curl_multi_cleanup(mhnd); } void _css_filter(void *userdata, char *gap, int length, bool gapped) { struct _css_filter_userdata *cfu = (struct _css_filter_userdata*) userdata; char *filename; if (gapped) cfu->url = __join_together(cfu->url, gap, length); else if (!gapped && cfu->url != NULL) { filename = _site_files_add(cfu->su, cfu->url, NULL, CSS_IMG); _user_function(cfu->su, "%s", filename); free(cfu->url); cfu->url = NULL; _user_function(cfu->su, "%.*s", length, gap); } else _user_function(cfu->su, "%.*s", length, gap); } static void _getpage_startElementSAX (void * userData, const xmlChar * name, const xmlChar ** atts) { int i, j; char *n = (char*)name; char *filename, *url; struct _site_userdata *su = userData; struct _css_filter_userdata cfu; struct _replace_info ri; _user_function(su, "<%s", n); if (atts != NULL) for (i = 0; atts[i] != NULL; i+=2) { filename = NULL; if (!strncasecmp(n, "img", 4) && !strncasecmp((char*)atts[i], "src", 4)) { filename = 
_site_files_add(su, (char*)atts[i+1], NULL, IMG); _user_function(su, " src=\"file:%s\"", filename); } else if (!strncasecmp(n, "input", 6) && !strncasecmp((char*)atts[i], "src", 4)) { filename = _site_files_add(su, (char*)atts[i+1], NULL, IMG); _user_function(su, " src=\"file:%s\"", filename); } else if (!strncasecmp(n, "script", 7) && !strncasecmp((char*)atts[i], "src", 4)) { filename = _site_files_add(su, (char*)atts[i+1], NULL, SCRIPT); _user_function(su, " src=\"file:%s\"", filename); } else if (!strncasecmp(n, "iframe", 7) && !strncasecmp((char*)atts[i], "src", 4)) { filename = _site_files_add(su, (char*)atts[i+1], NULL, IFRAME); _user_function(su, " src=\"file:%s\"", filename); } else if (!strncasecmp((char*)atts[i], "style", 6)) { cfu.su = su; cfu.url = NULL; _set_css_ri(&ri, &cfu, _css_filter); _user_function(su, " style=\""); replace(&ri, (char*)atts[i+1], strlen((char*)atts[i+1])); if (cfu.url != NULL) free(cfu.url); _user_function(su, "\""); filename = (void*)-1; } else if (!strncasecmp(n, "link", 5) && !strncasecmp((char*)atts[i], "href", 5)) { for (j = 0; atts[j] != NULL; j+=2) if (!strncasecmp((char*)atts[j], "rel", 4)) { if (!strncasecmp((char*)atts[j+1], "stylesheet", 11)) { filename = _site_files_add(su, (char*)atts[i+1], NULL, STYLE); _user_function(su, " href=\"file:%s\"", filename); } else if (!strncasecmp((char*)atts[j+1], "icon", 5)) { filename = _site_files_add(su, (char*)atts[i+1], NULL, IMG); _user_function(su, " href=\"file:%s\"", filename); } else if (!strncasecmp((char*)atts[j+1], "shortcut icon", 14)) { filename = _site_files_add(su, (char*)atts[i+1], NULL, IMG); _user_function(su, " href=\"file:%s\"", filename); } } } else if (!strncasecmp(n, "a", 2) && !strncasecmp((char*)atts[i], "href", 5)) { url = _absolute_url(su->_hnd, (char*)atts[i+1], su->_base_url, 1); _user_function(su, " href=\"%s\"", url); free(url); filename = (void*)-1; } else if (!strncasecmp(n, "base", 5) && !strncasecmp((char*)atts[i], "href", 5)) { _user_function(su, 
" href=\".\""); filename = (void*)-1; } else if (!strncasecmp(n, "form", 5) && !strncasecmp((char*)atts[i], "action", 7)) { url = _absolute_url(su->_hnd, (char*)atts[i+1], su->_base_url, 1); _user_function(su, " action=\"%s\"", url); free(url); filename = (void*)-1; } else if (!strncasecmp(n, "meta", 5) && !strncasecmp((char*)atts[i], "http-equiv", 8) && !strncasecmp((char*)atts[i+1], "Content-Type", 13)) { su->_utf8_meta_set = true; _user_function(su, " http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\""); //filename = (void*)-1; break; } if (filename == NULL) _user_function(su, " %s=\"%s\"", (char*)atts[i], (char*)atts[i+1]); } _user_function(su, ">"); } static void _getpage_endElementSAX (void * userData, const xmlChar * name) { char *n = (char*)name; struct _site_userdata *su = userData; if (!strncasecmp("head", n, 5) && !su->_utf8_meta_set) _user_function(su, "<meta http-equiv="\"Content-Type\"" content="\"text/html;" charset="utf-8\"/" /> "); else if (strncasecmp("br", n, 3) && strncasecmp("img", n, 4) && strncasecmp("meta", n, 5) && strncasecmp("link", n, 5) && strncasecmp("input", n, 5)) _user_function(su, "\n", n); } static void _getpage_charDataSAX (void * userData, const xmlChar * buffer, int length) { struct _site_userdata *su = userData; _user_function(su, "%.*s", length, buffer); } static size_t _chunk_parse(void *ptr, size_t size, size_t nmemb, xmlParserCtxtPtr ctxt) { char *txt = ptr; #ifdef DEBUG FILE *fp = fopen("bare.txt", "a+"); fprintf(fp, "%.*s", (int)(size*nmemb), txt); fclose(fp); #endif htmlParseChunk(ctxt, txt, size*nmemb, 0); return nmemb*size; } void getpage(char *url, void *site_function, void *userdata) { struct _site_userdata su; su.site_function = site_function; su.userdata = userdata; su.sf = NULL; su._utf8_meta_set = false; su._base_url = NULL; CURLcode ret; htmlSAXHandler hsh; htmlParserCtxtPtr ctxt; #ifdef DEBUG remove("bare.txt"); #endif memset(&hsh, 0, sizeof(htmlSAXHandler)); hsh.startElement = 
_getpage_startElementSAX; hsh.endElement = _getpage_endElementSAX; hsh.characters = _getpage_charDataSAX; ctxt = htmlCreatePushParserCtxt(&hsh, &su, NULL, 0, NULL, XML_CHAR_ENCODING_UTF8); htmlCtxtUseOptions(ctxt, HTML_PARSE_RECOVER); curl_global_init(CURL_GLOBAL_ALL); su._hnd = curl_easy_init(); _set_chnd(su._hnd, url, _chunk_parse, ctxt); ret = curl_easy_perform(su._hnd); htmlParseChunk(ctxt, NULL, 0, 1); htmlFreeParserCtxt(ctxt); curl_easy_getinfo(su._hnd, CURLINFO_EFFECTIVE_URL, &su._base_url); #ifdef DEBUG double val; if (curl_easy_getinfo(su._hnd, CURLINFO_SPEED_DOWNLOAD, &val) == CURLE_OK) printf("Average download speed: %0.3f kbyte/sec.\n", val / 1024); #endif fprintf(stderr, "Downloading files ...\n"); _download_files(&su); curl_easy_cleanup(su._hnd); // curl_global_cleanup(); }
(save it as getpage.c)
To use that library:
#include #include #include "getpage.h" void site_function(void *userdata, const char* format, va_list ap) { FILE *fp = userdata; vfprintf(fp, format, ap); fflush(fp); } int main(int argc, char **argv) { FILE *fp = fopen(argv[2], "w"); if (fp == NULL) return -1; getpage(argv[1], site_function, fp); fclose(fp); }
(save that as getpagetest.c)
And now the Makefile (the recipe lines must be indented with tabs, not spaces!):
# Build the getpage library object and its test program.
# Recipe lines must start with a tab character.
CC=/usr/bin/colorgcc
CFLAGS=-O2 -ggdb -Wall

# Libraries go after the objects that use them: the linker resolves
# symbols from left to right.
getpagetest: getpage.o getpagetest.c
	$(CC) $(CFLAGS) -std=c99 -o getpagetest getpagetest.c getpage.o -I /usr/include/libxml2/ -lxml2 -lcurl

getpage.o: getpage.c
	$(CC) $(CFLAGS) -o getpage.o -Wall -std=c99 -fPIC getpage.c -I /usr/include/libxml2/ -c
Usage:
make && ./getpagetest heise.de index.html && chromium --proxy-server=localhost:1 index.html