一、xxxxxx獲取指定任務爬取的所有url的介面 介面名稱:xxxxxx獲取指定任務爬取的所有url的介面 訪問鏈接: http://IP:PORT/crwalTask/findUrlExceptionById?ctId=ctIdVal&time=timeVal&limit=limitVal 傳入 ...
一、xxxxxx獲取指定任務爬取的所有url的介面
介面名稱:xxxxxx獲取指定任務爬取的所有url的介面
訪問鏈接:
http://IP:PORT/crwalTask/findUrlExceptionById?ctId=ctIdVal&time=timeVal&limit=limitVal
傳入參數類型:String,int
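參數內容:ctId 為指定的任務Id(String);time 為查詢時間(String,格式 "yyyy-mm-dd" 或 "yyyy-mm-dd HH:mm:ss");limit 為查詢限制的條數(int)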
返回類型:JSONArray
返回內容:
調用方法Demo
1 public static void main(String[] args) throws Exception { 2 //爬蟲訪問介面地址 3 String req_url = "http://192.168.1.105:8080/crwalTask/findUrlExceptionById?ctId=ctIdVal&time=timeVal&limit=limitVal"; 4 JSONArray jsonArray = httpRequest(req_url,"ba716af7-105c-481b-bf28-2e9231529947",SelectUtil.time,SelectUtil.number);//200 5 System.out.println(jsonArray); 6 } 7 8 public class SelectUtil { 9 public static final String time = "2018-03-05".replaceAll(" ", "=");//按時間篩選 格式"yyyy-mm-dd"或"yyyy-mm-dd HH:mm:ss" 10 public static final int number = 162;//查詢限制數量 11 } 12 /** 13 * 獲取指定任務爬取的所有url信息 14 * @param req_url 訪問指定任務爬取的url的鏈接地址 15 * @param ctId 指定的任務Id 16 * @param time 查詢時間 17 * @param limit 查詢限制的條數 18 * @return 19 */ 20 public static JSONArray httpRequest(String req_url,String ctId,String time,int limit) { 21 req_url = req_url.replace("ctIdVal",ctId); 22 req_url = req_url.replace("timeVal",time); 23 req_url = req_url.replace("limitVal",String.valueOf(limit)); 24 StringBuffer buffer = new StringBuffer(); 25 JSONArray jsonArray = null; 26 try { 27 URL url = new URL(req_url); 28 HttpURLConnection httpUrlConn = (HttpURLConnection) url.openConnection(); 29 30 httpUrlConn.setDoOutput(false); 31 httpUrlConn.setDoInput(true); 32 httpUrlConn.setUseCaches(false); 33 34 httpUrlConn.setRequestMethod("POST"); 35 httpUrlConn.connect(); 36 37 // 將返回的輸入流轉換成字元串 38 InputStream inputStream = httpUrlConn.getInputStream(); 39 InputStreamReader inputStreamReader = new InputStreamReader(inputStream, "utf-8"); 40 BufferedReader bufferedReader = new BufferedReader(inputStreamReader); 41 42 String str = null; 43 while ((str = bufferedReader.readLine()) != null) { 44 buffer.append(str); 45 } 46 bufferedReader.close(); 47 inputStreamReader.close(); 48 // 釋放資源 49 inputStream.close(); 50 inputStream = null; 51 httpUrlConn.disconnect(); 52 if("".equals(buffer.toString())){ 53 String exception = "[\"exception\",\"查詢的記錄數超過240\"]"; 54 55 jsonArray = JSONArray.fromObject(exception); 56 }else{ 57 jsonArray = 
JSONArray.fromObject(buffer.toString()); 58 } 59 } catch (Exception e) { 60 System.out.println(e.getMessage()); 61 } 62 63 return jsonArray; 64 }View Code
需要的Jar包:
commons-beanutils-1.9.3.jar
commons-collections-3.2.2.jar
commons-lang-2.6.jar
commons-logging-1.2.jar
ezmorph-1.0.6.jar
json-lib-2.4-jdk15.jar
SQL腳本
-- urlpathmapper: add a column for the error information of a URL run.
alter table urlpathmapper add exceptionInfo varchar(2048) comment 'URL運行錯誤信息';
-- urlpathmapper: add a column for the crawled title.
alter table urlpathmapper add title varchar(256) comment '爬取標題';
-- crawltaskmanage: add a file-verification flag (per the column comment: 0 = yes, 1 = no).
alter table crawltaskmanage add checkFile varchar(8) comment '文件是否校驗 0是 1否';
-- crawltaskmanage: add the SimHash duplication comparison value.
alter table crawltaskmanage add SimHashValue int(8) comment 'SimHash演算法重覆度比較值';