Tag Archives: curl

Website downloader

This is an updated version (mainly of the source code) of an earlier blog entry at spin.
How does it work?

  • use libxml to parse html
  • scan css files for image urls
  • sorry, but no javascript :(
  • correct relative and absolute urls
  • download all these files

Le sourcecode:

#include <stdarg.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <sys/select.h>

#include <curl/curl.h>
#include <libxml/HTMLparser.h>
#include <libxml/parser.h>
#include <libxml/xmlerror.h>

#include "getpage.h"

/* Upper bound, terminating NUL included, for a generated local file name. */
#define FILELENGTH 150
/* Seconds passed to both CURLOPT_CONNECTTIMEOUT and CURLOPT_TIMEOUT. */
#define CURL_TIMEOUT_SEC 240
/* Seconds a single select() call in the download loop may block. */
#define SELECT_TIMEOUT_SEC 10
/* Maximum number of file downloads kept in flight at the same time. */
#define MAX_P_FILE_DOWNLOADS 10

/* Enables the diagnostic output and the id/done bookkeeping fields. */
#define DEBUG

/* Characters _filename_gen() draws from for the random part of a file name. */
static char ALPHABET[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890";

/*
Get out:
  * TODO: frames
  * TODO: javascript file riddles ;)
  * TODO: wrong content-length -> reload
  * TODO: same file in 2 different CSS files

*/

/* Kind of resource a queued download refers to; every kind owns one bit. */
enum FILETYPE
{
  IMG     = 1 << 0,  /* image referenced from the HTML document */
  STYLE   = 1 << 1,  /* stylesheet */
  SCRIPT  = 1 << 2,  /* script file */
  IFRAME  = 1 << 3,  /* iframe source */
  FRAME   = 1 << 4,  /* frame source */
  PDF     = 1 << 5,  /* PDF document */
  CSS_IMG = 1 << 6,  /* image referenced from CSS via url(...) */
  NONE    = 1 << 7   /* anything else */
};

/*
 * State of the streaming "begin ... end" scanner driven by replace() and
 * replace_step(). Text is handed to userfunction in segments, flagged as
 * lying inside or outside a begin/end gap.
 */
struct _replace_info
{
  char *begin;                                    /* opening marker string */
  char end;                                       /* closing marker character */

  void (*userfunction) (void*, char*, int, bool); /* segment callback: (userdata, text, length, inside gap) */
  void *userdata;                                 /* passed through to userfunction */

  char *buffer;                                   /* heap text carried over between replace() calls, or NULL */
  int begin_progress;                             /* number of begin characters matched so far */
  int begin_length;                               /* length of begin */
  bool inside_gap;                                /* true between begin and end */
  int status;                                     /* last replace_step() result: 1 inside, -1 outside */
};

/* One entry of the singly linked list of files to download for a page. */
struct _site_files
{
  char *url;                 /* first-choice absolute URL (heap, freed in _download_files) */
  char *url2;                /* second-choice absolute URL, may be NULL */
  char *filename;            /* local file name the resource is stored under */
  enum FILETYPE ft;          /* kind of resource */
  struct _site_files *next;  /* next list entry or NULL */
  FILE *fp;                  /* output stream used by the save callbacks */
  struct _replace_info *ri;  /* url(...) scanner state; allocated only for STYLE entries */
  short nth_url;             /* which URL the current attempt uses (1 or 2) */
#ifdef DEBUG
  int id;                    /* running number assigned when the download is queued */
  bool done;                 /* set once the transfer has finished */
#endif
};

/* Per-page state shared by the SAX callbacks and the download loop. */
struct _site_userdata
{
        //void (*site_function)(void*, const char*, ...);
        /* Output callback: receives userdata, a printf-style format and its arguments. */
        void (*site_function)(void*, const char*, va_list);
        /* Opaque pointer handed back to site_function. */
        void *userdata;

  struct _site_files *sf;   /* head of the list of files to download */
  char *_base_url;          /* effective URL of the page; NULL until getpage() has fetched it */
  bool _utf8_meta_set;      /* true once a Content-Type meta element has been emitted */
  CURL *_mhnd;              /* NOTE(review): not referenced anywhere in the visible code */
  CURL *_hnd;               /* easy handle used to fetch the page itself */
};

/* Callback state for rewriting url(...) references inside inline style attributes. */
struct _css_filter_userdata
{
  struct _site_userdata *su;  /* page state the rewritten text is emitted through */
  char *url;                  /* URL text collected so far for the current url(...), or NULL */
};

/* Callback state for rewriting url(...) references while saving a stylesheet. */
struct _css_filter_save_userdata
{
  struct _site_userdata *su;  /* page state; newly found files are added to its list */
  FILE *fp;                   /* output stream of the rewritten stylesheet, NULL until first use */
  char *filename;             /* local file name the stylesheet is written to */
  char *url;                  /* URL text collected so far for the current url(...), or NULL */
  char *_css_base_url;        /* URL of the stylesheet, used as base for its references */
};

/* Map a file type to its printable name; unknown values yield "DEFAULT". */
static char *_filetype_string(enum FILETYPE ft)
{
  switch (ft)
  {
    case IMG:     return "IMG";
    case CSS_IMG: return "CSS_IMG";
    case STYLE:   return "STYLE";
    case SCRIPT:  return "SCRIPT";
    case IFRAME:  return "IFRAME";
    case FRAME:   return "FRAME";
    case PDF:     return "PDF";
    case NONE:    return "OTHER";
    default:      break;
  }

  return "DEFAULT";
}

/*
 * Forward a printf-style message to the caller-supplied output callback.
 *
 * su  - page state holding the callback and its userdata
 * fmt - printf-style format string; the variadic arguments are packed into
 *       a va_list and passed on to su->site_function
 */
static void _user_function(struct _site_userdata *su, const char *fmt, ...)
{
  va_list ap;
  va_start(ap, fmt);
  su->site_function(su->userdata, fmt, ap);
  va_end(ap);
}

/*
 * Append the first len_b bytes of b to the heap string a.
 *
 * a     - NUL-terminated heap string to extend, or NULL to start a new one;
 *         ownership passes to this function (it is handed to realloc)
 * b     - bytes to append; may be NULL, in which case nothing is appended
 * len_b - number of bytes of b to append; values <= 0 append nothing
 *
 * Returns the (possibly moved) NUL-terminated result, or NULL when the
 * reallocation fails. In that case a itself is left untouched by realloc.
 */
static char *__join_together(char *a, char *b, int len_b)
{
        size_t len_a = (a != NULL) ? strlen(a) : 0;
        size_t add = (b != NULL && len_b > 0) ? (size_t)len_b : 0;
        char *joined = realloc(a, len_a + add + 1);

        if (joined == NULL)
                return NULL;

        if (add > 0)
                memcpy(joined + len_a, b, add);
        joined[len_a + add] = '\0';

        return joined;
}

// return true if inside gap -> 1
// return false if outside gap -> -1
/*
 * Feed one character to the begin/end scanner.
 *
 * The begin marker is matched incrementally across calls. The character
 * that completes the marker, and the end character that closes a gap, both
 * count as "outside"; everything between them counts as "inside".
 *
 * Returns 1 when txt lies inside a gap, -1 otherwise.
 */
static inline int replace_step(struct _replace_info *ri, char txt)
{
        if (txt == ri->begin[ri->begin_progress])
                ri->begin_progress++;
        else
                /*
                 * Mismatch: start over, but let this very character open a
                 * new match. Otherwise input such as "uurl(" is missed.
                 */
                ri->begin_progress = (txt == ri->begin[0]) ? 1 : 0;

        if (ri->begin_progress == ri->begin_length)
        {
                ri->begin_progress = 0;
                ri->inside_gap = true;
                return -1;
        }

        if (!ri->inside_gap)
                return -1;

        if (txt == ri->end)
        {
                ri->inside_gap = false;
                return -1;
        }

        return 1;
}

/*
 * Pass one segment to the callback, flagged according to the scanner's
 * current status (1 -> inside a gap, -1 -> outside). Any other status
 * emits nothing.
 */
static void _replace_emit(struct _replace_info *ri, char *data, int len)
{
        if (ri->status == 1)
                ri->userfunction(ri->userdata, data, len, true);
        else if (ri->status == -1)
                ri->userfunction(ri->userdata, data, len, false);
}

/*
 * Run the begin/end scanner over txt[0..length) and report the text to
 * ri->userfunction in segments, each flagged as inside or outside a gap.
 * A segment ends wherever the result of replace_step() changes; the
 * remainder of the chunk is reported after the loop, so scanner state
 * carries over to the next call.
 */
static void replace(struct _replace_info *ri, char *txt, int length)
{
        int pos;
        int seg_start = 0;
        int step = -1;

        for (pos = 0; pos < length; pos++)
        {
                step = replace_step(ri, txt[pos]);

                if (step != ri->status)
                {
                        /* Flush text carried over from an earlier call first. */
                        if (ri->buffer != NULL)
                        {
                                _replace_emit(ri, ri->buffer, strlen(ri->buffer));
                                free(ri->buffer);
                                ri->buffer = NULL;
                        }

                        _replace_emit(ri, txt + seg_start, pos - seg_start);
                        seg_start = pos;
                }

                ri->status = step;
        }

        /* Nothing left over in this chunk. */
        if (seg_start == length)
                return;

        if (step == 1 || step == -1)
        {
                ri->userfunction(ri->userdata, txt + seg_start, pos - seg_start, ri->status == 1);
                return;
        }

        /* Undetermined status: keep the tail for later unless the input ends here. */
        if (txt[length-1] != '\0')
        {
                ri->buffer = __join_together(ri->buffer, txt + seg_start, length - seg_start);
                return;
        }

        if (ri->buffer != NULL)
                ri->userfunction(ri->userdata, ri->buffer, strlen(ri->buffer), false);
        free(ri->buffer);
        ri->buffer = NULL;
        ri->userfunction(ri->userdata, txt + seg_start, length - seg_start, false);
}

/*
 * Configure an easy handle for a plain GET of url.
 *
 * hnd        - easy handle to configure
 * url        - URL to fetch
 * cbfunction - write callback installed via CURLOPT_WRITEFUNCTION
 * userdata   - pointer handed to the write callback (CURLOPT_WRITEDATA)
 *
 * curl_easy_setopt() is variadic, so integer options must be passed as
 * long; the values below therefore carry an L suffix or a cast.
 */
static void _set_chnd(CURL *hnd, char *url, void *cbfunction, void *userdata)
{
  curl_easy_setopt(hnd, CURLOPT_INFILESIZE_LARGE, (curl_off_t)-1);
  curl_easy_setopt(hnd, CURLOPT_URL, url);
  curl_easy_setopt(hnd, CURLOPT_NOPROGRESS, 1L);
  curl_easy_setopt(hnd, CURLOPT_FAILONERROR, 0L);
  curl_easy_setopt(hnd, CURLOPT_USERAGENT, "libmessage - btwotch+libmessage@gmail.com");
  //curl_easy_setopt(hnd, CURLOPT_USERAGENT, "Mozilla/5.0 (X11; U; Linux x86_64; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.62 Safari/534.3");
  curl_easy_setopt(hnd, CURLOPT_RESUME_FROM_LARGE, (curl_off_t)0);
  curl_easy_setopt(hnd, CURLOPT_MAXREDIRS, 50L);
  curl_easy_setopt(hnd, CURLOPT_SSLVERSION, 0L);
  curl_easy_setopt(hnd, CURLOPT_TIMECONDITION, 0L);
  curl_easy_setopt(hnd, CURLOPT_TIMEVALUE, 0L);
  curl_easy_setopt(hnd, CURLOPT_CUSTOMREQUEST, NULL);
  curl_easy_setopt(hnd, CURLOPT_CONNECTTIMEOUT, (long)CURL_TIMEOUT_SEC);
  curl_easy_setopt(hnd, CURLOPT_TIMEOUT, (long)CURL_TIMEOUT_SEC);
  curl_easy_setopt(hnd, CURLOPT_HTTPAUTH, (long)CURLAUTH_BASIC);
  curl_easy_setopt(hnd, CURLOPT_IPRESOLVE, 0L);
  curl_easy_setopt(hnd, CURLOPT_IGNORE_CONTENT_LENGTH, 0L);
  curl_easy_setopt(hnd, CURLOPT_POSTREDIR, 0L);
  curl_easy_setopt(hnd, CURLOPT_WRITEFUNCTION, cbfunction);
  curl_easy_setopt(hnd, CURLOPT_WRITEDATA, userdata);
  curl_easy_setopt(hnd, CURLOPT_FOLLOWLOCATION, 1L);
  curl_easy_setopt(hnd, CURLOPT_NOSIGNAL, 1L);
  curl_easy_setopt(hnd, CURLOPT_AUTOREFERER, 1L);
  /* Set once; the earlier redundant NULL assignment was overwritten anyway. */
  curl_easy_setopt(hnd, CURLOPT_ENCODING, "deflate");
  /* 2 requests the host name check; 1 is rejected or remapped depending on the libcurl version. */
  curl_easy_setopt(hnd, CURLOPT_SSL_VERIFYHOST, 2L);
}

/*
 * Replace the second half of filename with random characters until the
 * name differs (case-insensitively) from every name in the list.
 *
 * first_sf - list to check against; scanning stops at the first entry
 *            whose filename is NULL
 * filename - buffer of at least FILELENGTH bytes; bytes FILELENGTH/2 up to
 *            FILELENGTH-1 are overwritten and the result is NUL-terminated
 */
static void _filename_gen(struct _site_files *first_sf, char *filename)
{
  const size_t alphabet_len = sizeof(ALPHABET) - 1;
  bool name_double;
  struct _site_files *sf;
  int i;

  /*
   * Seed once, outside the retry loop. Reseeding on every attempt would
   * regenerate the identical name and never resolve a collision.
   */
  srand(1337^filename[0]);

  do
  {
    name_double = false;

    for (i = FILELENGTH/2; i < FILELENGTH-1; i++)
      filename[i] = ALPHABET[rand() % alphabet_len];
    filename[FILELENGTH-1] = '\0';

    for (sf = first_sf; sf != NULL && sf->filename != NULL; sf = sf->next)
    {
      if (!strcasecmp(sf->filename, filename))
      {
        name_double = true;
        break;
      }
    }
  } while (name_double);
}

/*
 * Trim a URL taken from markup or CSS in place: drop leading and trailing
 * spaces, then strip matching pairs of surrounding quotes.
 *
 * Returns a pointer into rurl; the buffer itself is modified.
 */
static char* _shrink_url(char *rurl) // remove apostrophes etc.
{
  size_t length;

  while (rurl[0] == ' ')
    rurl++;

  length = strlen(rurl);
  while (length > 0 && rurl[length-1] == ' ')
    rurl[--length] = '\0';

  /* Strip as long as the first and last characters are the same quote. */
  while (length >= 2 &&
         (rurl[0] == '\'' || rurl[0] == '\"') &&
         rurl[0] == rurl[length-1])
  {
    rurl[length-1] = '\0';
    rurl++;
    length -= 2;
  }

  return rurl;
}

/*
 * Resolve a protocol-relative reference ("//host/path").
 *
 * The scheme is taken from the base URL (static_burl, or the effective URL
 * of hnd when static_burl is NULL) if that is https; otherwise http is used.
 * *abs_url receives a malloc'd string, or is left untouched when rurl is
 * not protocol-relative or the allocation fails.
 */
static void _crap_sites_aburl(char **abs_url, CURL *hnd, char *rurl, char *static_burl)
{
  const char *scheme = "http:";
  char *burl = static_burl;
  size_t abs_urllen;

  if (strncasecmp(rurl, "//", 2)) // gmx-hack
    return;

  if (burl == NULL && hnd != NULL &&
      curl_easy_getinfo(hnd, CURLINFO_EFFECTIVE_URL, &burl) != CURLE_OK)
    burl = NULL;

  if (burl != NULL && !strncasecmp(burl, "https://", 8))
    scheme = "https:";

  abs_urllen = strlen(scheme) + strlen(rurl) + 1;
  *abs_url = malloc(abs_urllen);
  if (*abs_url != NULL)
    snprintf(*abs_url, abs_urllen, "%s%s", scheme, rurl);
}

/*
 * Turn a relative reference into an absolute URL.
 *
 * nth_url selects the base: 1 joins rurl to the part of the base URL in
 * front of the first single '/' after the scheme, -1 joins it to the part
 * in front of the last '/'. References that already carry a known scheme
 * are left alone (*abs_url stays untouched).
 */
static void _relative_aburl(char **abs_url, CURL *hnd, char *rurl, char *static_burl, short nth_url)
{
  static const char *const schemes[] = {
    "http://", "https://", "ftp://", "file://",
    "about:", "javascript:", "mailto:", "data:"
  };
  size_t s;
  int abs_urllen, burl_len;
  int domain_end = 0, i;
  int base_len = 0;
  char *burl = NULL;

  for (s = 0; s < sizeof(schemes)/sizeof(schemes[0]); s++)
    if (!strncasecmp(rurl, schemes[s], strlen(schemes[s])))
      return;

  if (static_burl != NULL)
    burl = static_burl;
  else if (curl_easy_getinfo(hnd, CURLINFO_EFFECTIVE_URL, &burl) != CURLE_OK)
  {
    fprintf(stderr, "CURLINFO_EFFECTIVE_URL failed\n");
    exit(1);
  }

  if (burl == NULL)
    return;

  burl_len = (int)strlen(burl);

  if (!strncasecmp(burl, "http://", 7))
    domain_end = 7;
  else if (!strncasecmp(burl, "https://", 8))
    domain_end = 8;
  else if (!strncasecmp(burl, "ftp://", 6))
    domain_end = 6;
  else if (!strncasecmp(burl, "file://", 7))
    domain_end = burl_len;

  if (nth_url > 0)
    for (i = domain_end+1; i < burl_len; i++)
    {
      if (burl[i] == '/')
      {
        /* A slash directly followed by another slash does not count. */
        if (i < burl_len-1 && burl[i+1] == '/')
          continue;

        if (nth_url == 1)
        {
          base_len = i;
          break;
        }
        else
          nth_url--;
      }
    }

  if (nth_url == -1)
    for (i = burl_len; i > domain_end; i--)
      if (burl[i] == '/')
      {
        base_len = i;
        break;
      }

  if (base_len == 0)
    base_len = burl_len;

  abs_urllen = (int)strlen(rurl) + burl_len + 2;
  *abs_url = malloc(sizeof(char)*abs_urllen);
  if (*abs_url != NULL)
    snprintf(*abs_url, abs_urllen, "%.*s/%s", base_len, burl, rurl);
}

/*
 * Build an absolute URL for rurl.
 *
 * For nth_url == 1 the protocol-relative resolver is tried first, then the
 * relative resolver, and as a last resort rurl is copied verbatim. For any
 * other nth_url only the relative resolver is used.
 *
 * Returns a malloc'd string owned by the caller, or NULL when no resolver
 * produced a result and nth_url != 1.
 */
static char* _absolute_url(CURL *hnd, char *rurl, char *static_burl, short nth_url)
{
  const bool first_choice = (nth_url == 1);
  char *result = NULL;

  if (first_choice)
    _crap_sites_aburl(&result, hnd, rurl, static_burl);

  if (result == NULL)
    _relative_aburl(&result, hnd, rurl, static_burl, nth_url);

  if (result == NULL && first_choice)
  {
    /* Copy the string including its terminating NUL. */
    size_t copy_len = strlen(rurl) + 1;

    result = malloc(copy_len + 1);
    memcpy(result, rurl, copy_len);
  }

  return result;
}

/*
 * Register a resource for download and return its local file name.
 *
 * su       - page state owning the download list
 * url      - reference as found in the document; trimmed in place
 * base_url - base for relative references, or NULL to use su->_base_url
 * ft       - kind of resource
 *
 * If the resolved URL is already in the list, the existing entry's file
 * name is returned. Otherwise a new entry is appended. The name is the URL
 * capped at FILELENGTH bytes, with the second half randomised when capped,
 * and with characters that are awkward in file names replaced by '_'.
 *
 * Returns NULL when url is NULL or an allocation fails. The returned
 * string is owned by the list entry.
 */
static char *_site_files_add(struct _site_userdata *su, char *url, char *base_url, enum FILETYPE ft)
{
  struct _site_files **link;
  struct _site_files *sf;
  char *effective_base;
  char *newurl, *newfilename, *sec_url;
  size_t filename_length, i;

  if (url == NULL)
    return NULL;

  url = _shrink_url(url);
  effective_base = (base_url != NULL) ? base_url : su->_base_url;
  newurl = _absolute_url(su->_hnd, url, effective_base, 1);
  sec_url = _absolute_url(su->_hnd, url, effective_base, -1);

  if (newurl == NULL)
  {
    free(sec_url);
    return NULL;
  }

  /* Walk to the end of the list, stopping early on an already known URL. */
  for (link = &su->sf; *link != NULL; link = &(*link)->next)
  {
    if (!strcmp((*link)->url, newurl))
    {
      free(newurl);
      free(sec_url);
      return (*link)->filename;
    }
  }

  filename_length = strlen(newurl)+1;
  if (filename_length > FILELENGTH)
    filename_length = FILELENGTH;

  sf = malloc(sizeof *sf);
  newfilename = malloc(filename_length);
  if (sf == NULL || newfilename == NULL)
  {
    free(sf);
    free(newfilename);
    free(newurl);
    free(sec_url);
    return NULL;
  }

  sf->ri = NULL;
  sf->next = NULL;
  sf->fp = NULL;
  sf->nth_url = 0;
  sf->filename = NULL;
  sf->ft = ft;
  sf->url = newurl;
  sf->url2 = sec_url;
#ifdef DEBUG
  sf->id = -1;
  sf->done = false;
#endif
  *link = sf;

  /* filename_length never exceeds strlen(newurl)+1, so this stays inside newurl. */
  memcpy(newfilename, newurl, filename_length);
  if (filename_length == FILELENGTH)
  {
    /*
     * NOTE(review): sf is the freshly appended entry whose filename is
     * still NULL, so the duplicate scan inside _filename_gen() sees no
     * names at all. Confirm whether the list head was meant here.
     */
    _filename_gen(sf, newfilename);
  }

  for (i = 0; newfilename[i] != '\0'; i++)
  {
    switch (newfilename[i])
    {
      case '/':
      case '?':
      case '#':
      case '@':
      case '%':
      case ':':
      case ' ':
        newfilename[i] = '_';
        break;
      default:
        break;
    }
  }

  sf->filename = newfilename;

  return sf->filename;
}

/*
 * Segment callback used while saving a stylesheet.
 *
 * Text outside url(...) is written to the output file unchanged. Text
 * inside url(...) is collected, and once the reference is complete it is
 * registered as a CSS_IMG download and replaced by its local file name.
 *
 * userdata - struct _css_filter_save_userdata of the stylesheet
 * gap      - segment text (not NUL-terminated)
 * length   - number of bytes in gap
 * gapped   - true when the segment lies inside url(...)
 */
void _save_file_css_save(void *userdata, char *gap, int length, bool gapped)
{
  struct _css_filter_save_userdata *cfsu = (struct _css_filter_save_userdata*) userdata;
  char *filename;

  if (cfsu->fp == NULL)
  { // first call of this func.
    cfsu->fp = fopen(cfsu->filename, "w");
    cfsu->url = NULL;
    if (cfsu->fp == NULL)
    {
      perror("fopen");
      return;
    }
  }

  if (gapped)
  {
    cfsu->url = __join_together(cfsu->url, gap, length);
    return;
  }

  if (cfsu->url != NULL)
  {
    filename = _site_files_add(cfsu->su, cfsu->url, cfsu->_css_base_url, CSS_IMG);
    /* Keep the original reference if no local file could be registered. */
    fprintf(cfsu->fp, "%s", (filename != NULL) ? filename : cfsu->url);
    free(cfsu->url);
    cfsu->url = NULL;
  }

  fprintf(cfsu->fp, "%.*s", length, gap);
}

/*
 * Write callback for stylesheets: feeds the received data to the url(...)
 * scanner attached to sf.
 *
 * Calling it with size == 0 and nmemb == 0 marks the end of the transfer:
 * the stream in sf->fp is closed if one is open, and nothing is opened.
 *
 * Returns the number of bytes consumed, or 0 when the output file cannot
 * be opened.
 */
size_t _save_file_css(char *txt, size_t size, size_t nmemb, struct _site_files *sf) // feed the replacer!
{
  if (size == 0 && nmemb == 0)
  {
    if (sf->fp != NULL)
    {
      fclose(sf->fp);
      sf->fp = NULL;
    }
    return 0;
  }

  if (sf->fp == NULL)
    sf->fp = fopen(sf->filename, "w");

  if (sf->fp == NULL)
  {
    perror("fopen");
    return 0;
  }

  replace(sf->ri, txt, size*nmemb);

  return size*nmemb;
}

/*
 * Write callback for ordinary files: appends the received data to the
 * local file of sf, opening it on first use.
 *
 * Calling it with size == 0 and nmemb == 0 marks the end of the transfer:
 * the stream in sf->fp is closed if one is open, and nothing is opened.
 *
 * Returns the number of bytes actually written; 0 when the output file
 * cannot be opened.
 */
size_t _save_file(char *txt, size_t size, size_t nmemb, struct _site_files *sf)
{
  if (size == 0 && nmemb == 0)
  {
    if (sf->fp != NULL)
    {
      fclose(sf->fp);
      sf->fp = NULL;
    }
    return 0;
  }

  if (sf->fp == NULL)
    sf->fp = fopen(sf->filename, "w");

  if (sf->fp == NULL)
  {
    perror("fopen");
    return 0;
  }

  /* With an element size of 1 the return value is the byte count written. */
  return fwrite(txt, 1, size*nmemb, sf->fp);
}

static void _set_css_ri(struct _replace_info *ri, void *userdata, void *userfunction)
{
  ri->begin = "url(";
  ri->end = ')';

  ri->userfunction = userfunction;
  ri->userdata = userdata;

  ri->buffer = NULL;

  ri->begin_progress = 0;
  ri->begin_length = 4;
  ri->inside_gap = false;

  ri->status = -1;
}

/*
 * Queue one download attempt for sf on the multi handle.
 *
 * sf      - list entry to download
 * su      - page state (handed to the CSS rewriting callback)
 * mhnd    - multi handle the new easy handle is added to
 * nth_url - 1 to use sf->url, 2 to use sf->url2
 *
 * STYLE entries get a url(...) scanner so references inside the stylesheet
 * are rewritten while it is saved. State left over from an earlier attempt
 * is closed and released first.
 *
 * Returns 1 when the transfer was queued, -1 otherwise.
 */
static int _add_download_files(struct _site_files *sf, struct _site_userdata *su, CURL *mhnd, short nth_url)
{
  struct _css_filter_save_userdata *cfsu;
  char *url = NULL;
  CURL *hnd;

  sf->fp = NULL;
  sf->nth_url = nth_url;

#ifdef DEBUG
  static int id;
  sf->id = id++;
#endif

  if (nth_url == 1)
    url = sf->url;
  else if (nth_url == 2)
    url = sf->url2;

  if (url == NULL)
    return -1;

#ifdef DEBUG
  fprintf(stderr, "Download (id: %d, %s) %s -> %s\n", sf->id, _filetype_string(sf->ft), url, sf->filename);
#endif

  hnd = curl_easy_init();
  if (hnd == NULL)
    return -1;

  if (sf->ft == STYLE)
  {
    /* Drop the state of a previous attempt so its stream cannot clobber the new file. */
    if (sf->ri != NULL)
    {
      cfsu = sf->ri->userdata;
      if (cfsu != NULL)
      {
        /* url is only initialised once the save callback has opened fp. */
        if (cfsu->fp != NULL)
        {
          fclose(cfsu->fp);
          free(cfsu->url);
        }
        free(cfsu);
      }
      free(sf->ri);
      sf->ri = NULL;
    }

    sf->ri = malloc(sizeof *sf->ri);
    cfsu = malloc(sizeof *cfsu);
    if (sf->ri == NULL || cfsu == NULL)
    {
      free(sf->ri);
      sf->ri = NULL;
      free(cfsu);
      curl_easy_cleanup(hnd);
      return -1;
    }

    /* References inside the stylesheet are relative to the URL actually fetched. */
    cfsu->_css_base_url = url;
    cfsu->fp = NULL;
    cfsu->url = NULL;
    cfsu->filename = sf->filename;
    cfsu->su = su;
    _set_css_ri(sf->ri, cfsu, _save_file_css_save);
    _set_chnd(hnd, url, _save_file_css, sf);
  }
  else
    _set_chnd(hnd, url, _save_file, sf);

  curl_easy_setopt(hnd, CURLOPT_PRIVATE, sf);
  if (curl_multi_add_handle(mhnd, hnd) != CURLM_OK)
  {
    curl_easy_cleanup(hnd);
    return -1;
  }

#ifdef DEBUG
  sf->done = false;
#endif

  return 1;
}

static void _download_files(struct _site_userdata *su)
{
  int handles = 1, msgs_in_queue, maxfd;
  int iteration = 0;
  int downloads = 0; // current downloads
  char *curlinfo_private;
  CURL *mhnd;
  CURLMsg *cmsg;
  struct _site_files *first_sf = su->sf;
  struct _site_files *sf = first_sf;
  struct _site_files *tmp_sf;
  struct timeval timeout;
  fd_set fdread, fdwrite, fderr;
  char *burl;
#ifdef DEBUG
  char *ip;
#endif
  long response_code;

  if (sf == NULL)
    return;
  mhnd = curl_multi_init();

  if (_add_download_files(sf, su, mhnd, 1) > 0)
    downloads++;
  sf = sf->next;

  while (CURLM_CALL_MULTI_PERFORM == curl_multi_perform(mhnd, &handles) && handles != 0);

  do
  {
    iteration++;
    FD_ZERO(&fdread);
    FD_ZERO(&fdwrite);
    FD_ZERO(&fderr);
    timeout.tv_sec = SELECT_TIMEOUT_SEC;
    timeout.tv_usec = 0;
    curl_multi_fdset(mhnd, &fdread, &fdwrite, &fderr, &maxfd);
    switch(select(maxfd+1, &fdread, &fdwrite, &fderr, &timeout))
    {
      case -1:
#ifdef DEBUG
        fprintf(stderr, "select bad :(\n");
        perror("!!! select failed ");
        while ((cmsg = curl_multi_info_read(mhnd, &msgs_in_queue)) != NULL)
        {
          if (cmsg->data.result != 0)
          {
            curl_easy_getinfo(cmsg->easy_handle, CURLINFO_PRIMARY_IP, &ip);
            fprintf(stderr, "ip: %s url: %s result: %d", ip, burl, cmsg->data.result);
            if (cmsg->data.result == 7)
              fprintf(stderr, " (couldn't connect)");
            fprintf(stderr, "\n");
          }
        }
        fprintf(stderr, "-----------\n");
#endif
      default:
        while ((cmsg = curl_multi_info_read(mhnd, &msgs_in_queue)) != NULL)
          if (cmsg->msg == CURLMSG_DONE)
          {
            curl_easy_getinfo(cmsg->easy_handle, CURLINFO_PRIVATE, &curlinfo_private);
            tmp_sf = (struct _site_files*)curlinfo_private;
            if (tmp_sf->ft == CSS_IMG)
              _save_file_css(NULL, 0, 0, tmp_sf);
            else
              _save_file(NULL, 0, 0, tmp_sf);

            downloads--;
#ifdef DEBUG
            tmp_sf->done = true;
#endif

            curl_easy_getinfo(cmsg->easy_handle, CURLINFO_EFFECTIVE_URL, &burl);
            curl_easy_getinfo(cmsg->easy_handle, CURLINFO_RESPONSE_CODE, &response_code);
            if (response_code >= 400)
            {
              if (tmp_sf->nth_url == 2)
              {
                fprintf(stderr, "Failed (%ld): %s -> %s (%s)   ", response_code, burl, tmp_sf->filename, _filetype_string(tmp_sf->ft));
                fprintf(stderr, "second url: %s\n", tmp_sf->url2);
              }
              else
              {
                if (_add_download_files(tmp_sf, su, mhnd, 2) > 0)
                  downloads++;
              }
            }

            curl_easy_cleanup(cmsg->easy_handle);
          }
        do
        {
          // download 1st file
          if (iteration == 1 && sf != NULL && downloads < MAX_P_FILE_DOWNLOADS)           {             if (_add_download_files(sf, su, mhnd, 1) > 0)
              downloads++;
          }

          while (sf != NULL && sf->next != NULL && downloads < MAX_P_FILE_DOWNLOADS) // sf->next must not be NULL as we are adding to the list ;)
          {
            if (_add_download_files(sf->next, su, mhnd, 1) > 0)
              downloads++;
            sf = sf->next;
          }
        } while (CURLM_CALL_MULTI_PERFORM == curl_multi_perform(mhnd, &handles) && handles != 0);

        break;

    }
  } while(handles != 0);

  while ((cmsg = curl_multi_info_read(mhnd, &msgs_in_queue)) != NULL)
  {
    curl_easy_getinfo(cmsg->easy_handle, CURLINFO_PRIVATE, &curlinfo_private);
    tmp_sf = (struct _site_files*)curlinfo_private;
    if (tmp_sf->ft == CSS_IMG)
      _save_file_css(NULL, 0, 0, tmp_sf);
    else
      _save_file(NULL, 0, 0, tmp_sf);
    downloads--;

    curl_easy_getinfo(cmsg->easy_handle, CURLINFO_EFFECTIVE_URL, &burl);
    curl_easy_getinfo(cmsg->easy_handle, CURLINFO_RESPONSE_CODE, &response_code);
    if (response_code >= 400)
    {
      if (tmp_sf->nth_url == 2)
      {
        fprintf(stderr, "Failed (%ld): %s -> %s (%s)   ", response_code, burl, tmp_sf->filename, _filetype_string(tmp_sf->ft));
        fprintf(stderr, "second url: %s\n", tmp_sf->url2);
      }
      else
      {
        if (_add_download_files(tmp_sf, su, mhnd, 2) > 0)
          downloads++;
      }
    }

    curl_easy_cleanup(cmsg->easy_handle);
#ifdef DEBUG
    tmp_sf->done = true;
#endif
  }

  sf = first_sf;
  while (sf != NULL)
  {
#ifdef DEBUG
    printf("id: %d url: %s url2: %s done: %d\n", sf->id, sf->url, sf->url2, sf->done);
#endif
    free(sf->url);
    free(sf->url2);
    free(sf->filename);
    if (sf->ri != NULL)
    {
      free(sf->ri->userdata);
      free(sf->ri);
    }
    tmp_sf = sf;
    sf = sf->next;
    free(tmp_sf);
  }

#ifdef DEBUG
  if (downloads != 0)
  {
    printf("!!downloads: %d (%s)\n", downloads, su->_base_url);
    exit(-1);
  }
#endif
  curl_multi_cleanup(mhnd);

}

/*
 * Segment callback for inline style attributes.
 *
 * Text inside url(...) is collected in the callback state. The first
 * segment after it registers the collected URL as a CSS_IMG download and
 * emits the local file name in its place. All text outside url(...) is
 * emitted unchanged through the page's output callback.
 */
void _css_filter(void *userdata, char *gap, int length, bool gapped)
{
  struct _css_filter_userdata *ctx = (struct _css_filter_userdata*) userdata;

  if (gapped)
  {
    ctx->url = __join_together(ctx->url, gap, length);
    return;
  }

  if (ctx->url != NULL)
  {
    char *local_name = _site_files_add(ctx->su, ctx->url, NULL, CSS_IMG);

    _user_function(ctx->su, "%s", local_name);
    free(ctx->url);
    ctx->url = NULL;
  }

  _user_function(ctx->su, "%.*s", length, gap);
}

static void _getpage_startElementSAX (void * userData, const xmlChar * name, const xmlChar ** atts)
{
  int i, j;
  char *n = (char*)name;
  char *filename, *url;
  struct _site_userdata *su = userData;
  struct _css_filter_userdata cfu;
  struct _replace_info ri;

  _user_function(su, "<%s", n);   if (atts != NULL)     for (i = 0; atts[i] != NULL; i+=2)     {       filename = NULL;       if (!strncasecmp(n, "img", 4) && !strncasecmp((char*)atts[i], "src", 4))       {         filename = _site_files_add(su, (char*)atts[i+1], NULL, IMG);         _user_function(su, " src=\"file:%s\"", filename);       }       else if (!strncasecmp(n, "input", 6) && !strncasecmp((char*)atts[i], "src", 4))       {         filename = _site_files_add(su, (char*)atts[i+1], NULL, IMG);         _user_function(su, " src=\"file:%s\"", filename);       }       else if (!strncasecmp(n, "script", 7) && !strncasecmp((char*)atts[i], "src", 4))       {         filename = _site_files_add(su, (char*)atts[i+1], NULL, SCRIPT);         _user_function(su, " src=\"file:%s\"", filename);       }       else if (!strncasecmp(n, "iframe", 7) && !strncasecmp((char*)atts[i], "src", 4))       {         filename = _site_files_add(su, (char*)atts[i+1], NULL, IFRAME);         _user_function(su, " src=\"file:%s\"", filename);       }       else if (!strncasecmp((char*)atts[i], "style", 6))       {         cfu.su = su;         cfu.url = NULL;         _set_css_ri(&ri, &cfu, _css_filter);         _user_function(su, " style=\"");         replace(&ri, (char*)atts[i+1], strlen((char*)atts[i+1]));         if (cfu.url != NULL)           free(cfu.url);         _user_function(su, "\"");         filename = (void*)-1;       }       else if (!strncasecmp(n, "link", 5) && !strncasecmp((char*)atts[i], "href", 5))       {         for (j = 0; atts[j] != NULL; j+=2)           if (!strncasecmp((char*)atts[j], "rel", 4))            {             if (!strncasecmp((char*)atts[j+1], "stylesheet", 11))             {               filename = _site_files_add(su, (char*)atts[i+1], NULL, STYLE);               _user_function(su, " href=\"file:%s\"", filename);             }             else if (!strncasecmp((char*)atts[j+1], "icon", 5))             {               filename = _site_files_add(su, 
(char*)atts[i+1], NULL, IMG);               _user_function(su, " href=\"file:%s\"", filename);             }             else if (!strncasecmp((char*)atts[j+1], "shortcut icon", 14))             {               filename = _site_files_add(su, (char*)atts[i+1], NULL, IMG);               _user_function(su, " href=\"file:%s\"", filename);             }           }       }       else if (!strncasecmp(n, "a", 2) && !strncasecmp((char*)atts[i], "href", 5))       {         url = _absolute_url(su->_hnd, (char*)atts[i+1], su->_base_url, 1);
        _user_function(su, " href=\"%s\"", url);
        free(url);
        filename = (void*)-1;
      }
      else if (!strncasecmp(n, "base", 5) && !strncasecmp((char*)atts[i], "href", 5))
      {
        _user_function(su, " href=\".\"");
        filename = (void*)-1;
      }
      else if (!strncasecmp(n, "form", 5) && !strncasecmp((char*)atts[i], "action", 7))
      {
        url = _absolute_url(su->_hnd, (char*)atts[i+1], su->_base_url, 1);
        _user_function(su, " action=\"%s\"", url);
        free(url);
        filename = (void*)-1;
      }
      else if (!strncasecmp(n, "meta", 5) && !strncasecmp((char*)atts[i], "http-equiv", 8) && !strncasecmp((char*)atts[i+1], "Content-Type", 13))
      {
        su->_utf8_meta_set = true;
        _user_function(su, " http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\"");
        //filename = (void*)-1;
        break;
      }

      if (filename == NULL)
        _user_function(su, " %s=\"%s\"", (char*)atts[i], (char*)atts[i+1]);

    }

  _user_function(su, ">");

}

/*
 * SAX end-element handler: emits the closing tag.
 *
 * When head closes and no Content-Type meta element has been emitted, a
 * UTF-8 declaration is inserted in front of the closing tag. Void elements
 * get no closing tag.
 *
 * NOTE(review): both format strings were damaged in the published listing
 * and are reconstructed here from the surrounding logic. Confirm against
 * the original source.
 */
static void _getpage_endElementSAX (void * userData, const xmlChar * name)
{
  static const char *const void_elements[] = {
    "br", "img", "meta", "link", "input",
    "area", "base", "col", "embed", "hr", "param", "source", "track", "wbr"
  };
  char *n = (char*)name;
  struct _site_userdata *su = userData;
  size_t k;

  if (!strcasecmp("head", n) && !su->_utf8_meta_set)
  {
    _user_function(su, "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\"/></head>\n");
    return;
  }

  for (k = 0; k < sizeof(void_elements)/sizeof(void_elements[0]); k++)
    if (!strcasecmp(void_elements[k], n))
      return;

  _user_function(su, "</%s>\n", n);
}

/*
 * SAX character-data handler: passes the text on to the output callback.
 *
 * buffer is not NUL-terminated, hence the length-limited "%.*s".
 * NOTE(review): the text is emitted as received, without escaping; confirm
 * whether markup characters in it need to be re-encoded.
 */
static void _getpage_charDataSAX (void * userData, const xmlChar * buffer, int length)
{
  struct _site_userdata *su = userData;
  _user_function(su, "%.*s", length, buffer);
}

/*
 * Write callback for the page itself: pushes each received chunk into the
 * HTML push parser. With DEBUG defined the raw data is also appended to
 * "bare.txt".
 *
 * Returns the number of bytes consumed, which is always the full chunk.
 */
static size_t _chunk_parse(void *ptr, size_t size, size_t nmemb, xmlParserCtxtPtr ctxt)
{
  char *txt = ptr;
  size_t total = size*nmemb;
#ifdef DEBUG
  FILE *fp = fopen("bare.txt", "a+");

  /* The dump is only a debugging aid; skip it when the file cannot be opened. */
  if (fp != NULL)
  {
    fwrite(txt, 1, total, fp);
    fclose(fp);
  }
#endif
  htmlParseChunk(ctxt, txt, (int)total, 0);

  return total;
}

void getpage(char *url, void *site_function, void *userdata)
{
  struct _site_userdata su;
  su.site_function = site_function;
  su.userdata = userdata;
  su.sf = NULL;
  su._utf8_meta_set = false;
  su._base_url = NULL;
  CURLcode ret;

  htmlSAXHandler hsh;
  htmlParserCtxtPtr ctxt;

#ifdef DEBUG
  remove("bare.txt");
#endif

  memset(&hsh, 0, sizeof(htmlSAXHandler));

  hsh.startElement = _getpage_startElementSAX;
  hsh.endElement = _getpage_endElementSAX;
  hsh.characters = _getpage_charDataSAX;

  ctxt = htmlCreatePushParserCtxt(&hsh, &su, NULL, 0, NULL, XML_CHAR_ENCODING_UTF8);
  htmlCtxtUseOptions(ctxt, HTML_PARSE_RECOVER);

  curl_global_init(CURL_GLOBAL_ALL);
  su._hnd = curl_easy_init();
  _set_chnd(su._hnd, url, _chunk_parse, ctxt);
  ret = curl_easy_perform(su._hnd);

  htmlParseChunk(ctxt, NULL, 0, 1);
  htmlFreeParserCtxt(ctxt);

  curl_easy_getinfo(su._hnd, CURLINFO_EFFECTIVE_URL, &su._base_url);

#ifdef DEBUG
  double val;
  if (curl_easy_getinfo(su._hnd, CURLINFO_SPEED_DOWNLOAD, &val) == CURLE_OK)
    printf("Average download speed: %0.3f kbyte/sec.\n", val / 1024);
#endif

  fprintf(stderr, "Downloading files ...\n");
  _download_files(&su);
  curl_easy_cleanup(su._hnd);
  // curl_global_cleanup();
}

(save it as getpage.c)

To use that library:

#include <stdarg.h>
#include <stdio.h>

#include "getpage.h"

/*
 * Output callback handed to getpage(): formats the message into the FILE
 * passed as userdata and flushes it right away.
 */
void site_function(void *userdata, const char* format, va_list ap)
{
  FILE *fp = userdata;

  vfprintf(fp, format, ap);
  fflush(fp);
}

/*
 * Usage: getpagetest <url> <output.html>
 *
 * Writes the rewritten page to the output file; the files the page
 * references are stored by getpage() under their generated local names.
 * Returns 0 on success, 1 on wrong usage, -1 when the output file cannot
 * be opened or written.
 */
int main(int argc, char **argv)
{
  FILE *fp;

  if (argc < 3)
  {
    fprintf(stderr, "Usage: %s <url> <output.html>\n", (argc > 0) ? argv[0] : "getpagetest");
    return 1;
  }

  fp = fopen(argv[2], "w");
  if (fp == NULL)
  {
    perror(argv[2]);
    return -1;
  }

  getpage(argv[1], site_function, fp);

  if (fclose(fp) != 0)
  {
    perror(argv[2]);
    return -1;
  }

  return 0;
}

(save that as getpagetest.c)

And now the Makefile (the recipe lines must be indented with tabs, not spaces):

CC=/usr/bin/colorgcc
CFLAGS=-O2 -ggdb -Wall

# Libraries are listed after the objects that use them, so the linker can
# resolve their symbols regardless of its as-needed setting.
getpagetest: getpage.o getpagetest.c
	$(CC) $(CFLAGS) -std=c99 -I /usr/include/libxml2/ -o getpagetest getpagetest.c getpage.o -lxml2 -lcurl

getpage.o: getpage.c
	$(CC) $(CFLAGS) -std=c99 -fPIC -I /usr/include/libxml2/ -c -o getpage.o getpage.c

Usage:

 make && ./getpagetest heise.de index.html && chromium --proxy-server=localhost:1 index.html