Coverage Report - net.sf.jmatchparser.util.CachingDownloader
 
Classes in this File Line Coverage Branch Coverage Complexity
CachingDownloader
0%
0/75
0%
0/22
2,5
 
 1  
 /*
 2  
  * Copyright (c) 2006 - 2011 Michael Schierl
 3  
  * 
 4  
  * All rights reserved.
 5  
  * 
 6  
  * Redistribution and use in source and binary forms, with or without
 7  
  * modification, are permitted provided that the following conditions
 8  
  * are met:
 9  
  * 
 10  
  * - Redistributions of source code must retain the above copyright notice,
 11  
  *   this list of conditions and the following disclaimer.
 12  
  *   
 13  
  * - Redistributions in binary form must reproduce the above copyright
 14  
  *   notice, this list of conditions and the following disclaimer in the
 15  
  *   documentation and/or other materials provided with the distribution.
 16  
  *   
 17  
  * - Neither name of the copyright holders nor the names of its
 18  
  *   contributors may be used to endorse or promote products derived from
 19  
  *   this software without specific prior written permission.
 20  
  *   
 21  
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND THE CONTRIBUTORS
 22  
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 23  
  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 24  
  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 25  
  * HOLDERS OR THE CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 26  
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 27  
  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 28  
  * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 29  
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
 30  
  * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 31  
  * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 32  
  */
 33  
 package net.sf.jmatchparser.util;
 34  
 
 35  
 import java.io.File;
 36  
 import java.io.FileInputStream;
 37  
 import java.io.FileOutputStream;
 38  
 import java.io.IOException;
 39  
 import java.io.InputStream;
 40  
 import java.io.InputStreamReader;
 41  
 import java.io.OutputStreamWriter;
 42  
 import java.io.PrintStream;
 43  
 import java.io.Reader;
 44  
 import java.io.StringWriter;
 45  
 import java.net.Proxy;
 46  
 import java.net.URL;
 47  
 import java.net.URLConnection;
 48  
 import java.util.HashMap;
 49  
 import java.util.Map;
 50  
 import java.util.Random;
 51  
 
 52  
 /**
 53  
  * A utility class used for downloading files from the Internet for parsing.
 54  
  * This class provides a built-in disk cache to avoid redownload of the same
 55  
  * URLs after a crash or after fixing a bug.
 56  
  */
 57  
 public class CachingDownloader {
 58  
 
 59  
         private final File cachePath;
 60  
         private final Proxy proxy;
 61  
         private final String cookies, useragent, forwardedFor;
 62  
         private final long delay;
 63  0
         private final Random rnd = new Random();
 64  
         private final int blacklistedSize;
 65  0
         private final Map<String, int[]> cacheNameLocks = new HashMap<String, int[]>();
 66  0
         private PrintStream debugStream = System.err;
 67  
 
 68  
         /**
 69  
          * Create a new caching downloader that stores its cache in the given
 70  
          * directory.
 71  
          * 
 72  
          * @param cachePath
 73  
          *            directory to store the cached files into
 74  
          */
 75  
         public CachingDownloader(File cachePath) {
 76  0
                 this(cachePath, null, null);
 77  0
         }
 78  
 
 79  
         /**
 80  
          * Create a new caching downloader with support for custom user agents and
 81  
          * cookies
 82  
          * 
 83  
          * @param cachePath
 84  
          *            directory to store the cached files into
 85  
          * @param cookies
 86  
          *            Value for the <tt>Cookie</tt> header
 87  
          * @param useragent
 88  
          *            Value for the <tt>User-Agent</tt> header
 89  
          */
 90  
         public CachingDownloader(File cachePath, String cookies, String useragent) {
 91  0
                 this(cachePath, Proxy.NO_PROXY, cookies, useragent, null, 0, -1);
 92  0
         }
 93  
 
 94  
         /**
 95  
          * Create a new caching downloader with all supported options.
 96  
          * 
 97  
          * @param cachePath
 98  
          *            Directory to store the cached files into
 99  
          * @param proxy
 100  
          *            Proxy to use for downloading
 101  
          * @param cookies
 102  
          *            Value for the <tt>Cookie</tt> header
 103  
          * @param useragent
 104  
          *            Value for the <tt>User-Agent</tt> header
 105  
          * @param forwardedFor
 106  
          *            Value for the <tt>X-Forwarded-For</tt> header. Every
 107  
          *            <tt>*</tt> will be replaced by a random number between 0 and
 108  
          *            255 for each request
 109  
          * @param delay
 110  
          *            Delay to wait before each download (useful if the target site
 111  
          *            blocks excess downloaders)
 112  
          * @param blacklistedSize
 113  
          *            Size of a proxy error page. If the response has the given
 114  
          *            size, the download will be repeated. Only needed if the proxy
 115  
          *            uses status code 200 for its error pages.
 116  
          */
 117  0
         public CachingDownloader(File cachePath, Proxy proxy, String cookies, String useragent, String forwardedFor, long delay, int blacklistedSize) {
 118  0
                 this.cachePath = cachePath;
 119  0
                 this.cookies = cookies;
 120  0
                 this.useragent = useragent;
 121  0
                 this.forwardedFor = forwardedFor;
 122  0
                 this.delay = delay;
 123  0
                 this.proxy = proxy;
 124  0
                 this.blacklistedSize = blacklistedSize;
 125  0
         }
 126  
 
 127  
         /**
 128  
          * Download the given file.
 129  
          * 
 130  
          * @param url
 131  
          *            URL of the file
 132  
          * @param cacheName
 133  
          *            Name to use for the file in the cache (must be a valid file
 134  
          *            name)
 135  
          * @return An input stream to read from the file
 136  
          */
 137  
         public InputStream download(String url, String cacheName) throws IOException {
 138  0
                 return download(url, null, cacheName);
 139  
         }
 140  
 
 141  
         /**
 142  
          * Download the given file using user defined POST data.
 143  
          * 
 144  
          * @param url
 145  
          *            URL of the file
 146  
          * @param postdata
 147  
          *            POST data to send to the URL, or <code>null</code> to not use
 148  
          *            any POST data
 149  
          * @param cacheName
 150  
          *            Name to use for the file in the cache (must be a valid file
 151  
          *            name)
 152  
          * @return An input stream to read from the file
 153  
          */
 154  
         public InputStream download(String url, String postdata, String cacheName) throws IOException {
 155  0
                 File cacheFile = new File(cachePath, cacheName);
 156  
                 int[] lock;
 157  0
                 synchronized (cacheNameLocks) {
 158  0
                         lock = cacheNameLocks.get(cacheName);
 159  0
                         if (lock == null) {
 160  0
                                 lock = new int[] { 0 };
 161  0
                                 cacheNameLocks.put(cacheName, lock);
 162  
                         }
 163  0
                         lock[0]++;
 164  0
                 }
 165  
                 try {
 166  0
                         synchronized (lock) {
 167  0
                                 if (!cacheFile.exists()) {
 168  0
                                         download(url, postdata, cacheFile);
 169  
                                 }
 170  0
                         }
 171  
                 } finally {
 172  0
                         synchronized (cacheNameLocks) {
 173  0
                                 lock[0]--;
 174  0
                                 if (lock[0] == 0) {
 175  0
                                         cacheNameLocks.remove(cacheName);
 176  
                                 }
 177  0
                         }
 178  0
                 }
 179  0
                 return new FileInputStream(cacheFile);
 180  
         }
 181  
 
 182  
         /**
 183  
          * Download the given file using user defined POST data into the given cache
 184  
          * file. This (private) method does not enforce any lockin, so you have to
 185  
          * make sure yourself not to call it twice for the same file.
 186  
          * 
 187  
          * @param url
 188  
          *            URL of the file
 189  
          * @param postdata
 190  
          *            POST data to send to the URL, or <code>null</code> to not use
 191  
          *            any POST data
 192  
          * @param cacheFile
 193  
          *            File to use to store the download to
 194  
          */
 195  
         private void download(String url, String postdata, File cacheFile) throws IOException {
 196  0
                 if (delay > 0) {
 197  0
                         debugStream.println("Waiting for " + url + " " + postdata);
 198  
                         try {
 199  0
                                 Thread.sleep(delay);
 200  0
                         } catch (InterruptedException ex) {
 201  0
                         }
 202  
                 }
 203  0
                 debugStream.println("Downloading " + url + " " + postdata);
 204  0
                 URLConnection conn = new URL(url).openConnection(proxy);
 205  0
                 if (cookies != null) {
 206  0
                         conn.setRequestProperty("Cookie", cookies);
 207  
                 }
 208  0
                 if (useragent != null) {
 209  0
                         conn.setRequestProperty("User-Agent", useragent);
 210  
                 }
 211  0
                 if (forwardedFor != null) {
 212  0
                         String ff = forwardedFor;
 213  0
                         synchronized (rnd) {
 214  0
                                 while (ff.indexOf('*') != -1) {
 215  0
                                         int pos = ff.indexOf('*');
 216  0
                                         ff = ff.substring(0, pos) + rnd.nextInt(256) + ff.substring(pos + 1);
 217  0
                                 }
 218  0
                         }
 219  0
                         conn.setRequestProperty("X-Forwarded-For", ff);
 220  
                 }
 221  0
                 if (postdata != null) {
 222  0
                         conn.setDoOutput(true);
 223  0
                         OutputStreamWriter wr = new OutputStreamWriter(conn.getOutputStream(), "ISO-8859-1");
 224  0
                         wr.write(postdata);
 225  0
                         wr.flush();
 226  
                 }
 227  0
                 InputStream in = conn.getInputStream();
 228  0
                 FileOutputStream out = new FileOutputStream(cacheFile);
 229  0
                 StreamForwarder.forward(in, out);
 230  0
                 if (blacklistedSize >= 0 && cacheFile.length() == blacklistedSize) {
 231  0
                         debugStream.println("------ Blacklisted size, retrying...");
 232  0
                         cacheFile.delete();
 233  0
                         download(url, postdata, cacheFile);
 234  
                 }
 235  0
         }
 236  
 
 237  
         /**
 238  
          * Load an {@link InputStream} completely into a {@link String}.
 239  
          * 
 240  
          * @param in
 241  
          *            Stream to read from
 242  
          * @param encoding
 243  
          *            Encoding to use
 244  
          * @return complete content of the stream
 245  
          */
 246  
         public static String loadStream(InputStream in, String encoding) throws IOException {
 247  0
                 Reader r = new InputStreamReader(in, encoding);
 248  0
                 StringWriter sw = new StringWriter();
 249  0
                 StreamForwarder.forward(r, sw);
 250  0
                 return sw.toString();
 251  
         }
 252  
         
 253  
         /**
 254  
          * Set the debug stream where status information is written to
 255  
          * ({@link System#err} by default).
 256  
          * 
 257  
          * @param debugStream
 258  
          *            New debug stream to use
 259  
          */
 260  
         public void setDebugStream(PrintStream debugStream) {
 261  0
                 this.debugStream = debugStream;
 262  0
         }
 263  
 }