前兩天朋友叫我模仿一個網站,剛剛開始,我一個頁面一個頁面查看源碼並複製和保存,花了我很多時間,一個字“累”,為了減輕工作量,我寫了個網站“克隆工具”,一鍵克隆,比起人工操作, 效率提高了200%以上,精確度也大大提高,雖然網上也很多網站克隆工具,但我覺得作為一個程式員,要有點研究精神,哈哈哈,可以根 ...
前兩天朋友叫我模仿一個網站,剛剛開始,我一個頁面一個頁面查看源碼並複製和保存,花了我很多時間,一個字“累”,為了減輕工作量,我寫了個網站“克隆工具”,一鍵克隆,比起人工操作,
效率提高了200%以上,精確度也大大提高,雖然網上也很多網站克隆工具,但我覺得作為一個程式員,要有點研究精神,哈哈哈,可以根據自己的需要隨意編寫自己需要的功能。
下面我將我寫的“網站克隆工具”實現方法分享給大家,源碼在文末有下載鏈接,有需要的朋友可以下載來玩,也可以根據自己的需要做相應的修改或優化。
一睹為快,先看看界面:
簡單的工作流程:
項目代碼目錄結構:
下面一步步實現程式功能:
1.新建主界面窗體(MainForm.cs):
2.新建模型類(UrlModel.cs)
/// <summary>
/// Plain data holder describing one URL handled by the clone tool.
/// </summary>
public class UrlModel
{
    /// <summary>
    /// Path part of the URL (for a parsed page URL), or the link text exactly
    /// as it was found in a page (for a discovered link).
    /// </summary>
    public string RelatedPath { get; set; }

    /// <summary>Fully qualified address of the resource.</summary>
    public string AbsoluteUri { get; set; }

    /// <summary>Address of the containing folder, i.e. everything before the last '/'.</summary>
    public string CurrPath { get; set; }

    /// <summary>Scheme and authority of the site, without a trailing slash.</summary>
    public string RootPath { get; set; }

    /// <summary>Host name.</summary>
    public string Host { get; set; }

    /// <summary>Port number.</summary>
    public int Port { get; set; }

    /// <summary>URL scheme, e.g. "http" or "https".</summary>
    public string Scheme { get; set; }
}
3.新建服務類(Services)
UrlParser:
/// <summary>
/// Splits an absolute http/https URL into the pieces stored in a <see cref="UrlModel"/>.
/// </summary>
public class UrlParser
{
    // Scheme and authority (host[:port]) up to the first '/' that follows "://".
    // Anchored so that a URL embedded later in the string can never be picked up.
    private static readonly Regex AuthorityPattern = new Regex(
        "^(?<scheme>https?)://(?<host>.+?)/",
        RegexOptions.Singleline | RegexOptions.IgnoreCase);

    /// <summary>
    /// Parses <paramref name="url"/> into scheme, host, port, path and base addresses.
    /// </summary>
    /// <param name="url">Absolute URL starting with "http:" or "https:".</param>
    /// <returns>A populated <see cref="UrlModel"/>.</returns>
    /// <exception cref="ArgumentException">
    /// The URL is null, shorter than 8 characters, or does not start with http:/https:.
    /// </exception>
    /// <exception cref="FormatException">
    /// The scheme/authority could not be extracted, or the port is not a number.
    /// </exception>
    public static UrlModel Parse(string url)
    {
        if (url == null || url.Length < 8)
        {
            throw new ArgumentException("url參數不正確");
        }
        if (!url.StartsWith("http:", StringComparison.OrdinalIgnoreCase)
            && !url.StartsWith("https:", StringComparison.OrdinalIgnoreCase))
        {
            throw new ArgumentException("url格式有誤");
        }

        // A URL whose only slashes belong to "://" has no path; give it the root path.
        if (url.LastIndexOf('/') < 8)
        {
            url = url + "/";
        }

        Match match = AuthorityPattern.Match(url);
        if (!match.Success)
        {
            throw new FormatException("url解析失敗!");
        }

        UrlModel model = new UrlModel();
        string scheme = match.Groups["scheme"].Value.ToLowerInvariant();
        string authority = match.Groups["host"].Value;

        // The default port depends on the scheme; https is 443, not 80.
        int defaultPort = scheme == "https" ? 443 : 80;

        if (authority.Contains(":"))
        {
            string[] parts = authority.Split(':');
            // Anything other than exactly "host:port" is left unset, as before.
            if (parts.Length == 2)
            {
                model.Host = parts[0];
                model.Port = int.Parse(parts[1], System.Globalization.CultureInfo.InvariantCulture);
            }
        }
        else
        {
            model.Host = authority;
            model.Port = defaultPort;
        }

        // The match ends on the slash that starts the path.
        int pathStart = match.Index + match.Length - 1;
        model.RelatedPath = url.Substring(pathStart);
        model.AbsoluteUri = url;
        model.Scheme = scheme;
        model.CurrPath = url.Substring(0, url.LastIndexOf("/"));

        if (model.Port == defaultPort)
        {
            model.RootPath = string.Format("{0}://{1}", model.Scheme, model.Host);
        }
        else
        {
            // The original format string was missing the closing brace of {2}
            // and threw FormatException for every explicit non-default port.
            model.RootPath = string.Format("{0}://{1}:{2}", model.Scheme, model.Host, model.Port);
        }

        return model;
    }
}
WebPageService:
/// <summary>
/// Web page helper: extracts links from a page's HTML and resolves them
/// against the address of the page they were found on.
/// </summary>
public class WebPageService
{
    // Link prefixes that are not treated as local site files.
    // "data:" is included so inline data URIs inside CSS url(...) are not queued as files.
    private static string[] excludekeys = { "http:", "https:", "//", "#", "javascript:", "?", "tel:", "mailto:", "data:" };

    private static readonly Regex HrefPattern =
        new Regex("href=\"(?<Url>.+?)\"", RegexOptions.IgnoreCase);

    private static readonly Regex SrcPattern =
        new Regex("(src=\"(?<Url>.+?)\"|url\\((?<Url>.+?)\\))", RegexOptions.IgnoreCase);

    /// <summary>
    /// Gets the values of all href attributes that point inside the site;
    /// links matching one of the excluded prefixes are dropped.
    /// </summary>
    /// <param name="url">Address of the page the HTML came from.</param>
    /// <param name="html">HTML source of the page.</param>
    /// <returns>The local links; empty when <paramref name="html"/> is null or empty.</returns>
    public static List<UrlModel> GetLocalHrefs(string url, string html)
    {
        return FilterLocal(GetHrefs(url, html));
    }

    /// <summary>
    /// Gets the values of all src attributes and CSS url(...) references that
    /// point inside the site; links matching one of the excluded prefixes are dropped.
    /// </summary>
    /// <param name="url">Address of the page the HTML came from.</param>
    /// <param name="html">HTML source of the page.</param>
    /// <returns>The local links; empty when <paramref name="html"/> is null or empty.</returns>
    public static List<UrlModel> GetLocalSrcs(string url, string html)
    {
        return FilterLocal(GetSrc(url, html));
    }

    // Keeps only the entries whose (lower-cased) key does not start with an excluded prefix.
    private static List<UrlModel> FilterLocal(Dictionary<string, UrlModel> urls)
    {
        List<UrlModel> local = new List<UrlModel>();
        if (urls == null)
        {
            return local;
        }

        foreach (KeyValuePair<string, UrlModel> entry in urls)
        {
            if (!IsExcluded(entry.Key))
            {
                local.Add(entry.Value);
            }
        }
        return local;
    }

    // True when the key begins with any of the excluded prefixes.
    private static bool IsExcluded(string key)
    {
        foreach (string prefix in excludekeys)
        {
            if (key.StartsWith(prefix, System.StringComparison.Ordinal))
            {
                return true;
            }
        }
        return false;
    }

    private static Dictionary<string, UrlModel> GetHrefs(string url, string html)
    {
        return Collect(url, html, HrefPattern);
    }

    private static Dictionary<string, UrlModel> GetSrc(string url, string html)
    {
        return Collect(url, html, SrcPattern);
    }

    // Runs one pattern over the HTML; returns null when there is no HTML to scan.
    private static Dictionary<string, UrlModel> Collect(string url, string html, Regex pattern)
    {
        if (string.IsNullOrEmpty(html))
        {
            return null;
        }

        UrlModel currUrl = UrlParser.Parse(url);
        Dictionary<string, UrlModel> urls = new Dictionary<string, UrlModel>();
        if (currUrl != null)
        {
            AddUrlModel(html, currUrl, urls, pattern);
        }
        return urls;
    }

    // Adds one UrlModel per distinct link matched by reg, keyed by the lower-cased link text.
    private static void AddUrlModel(string html, UrlModel currUrl, Dictionary<string, UrlModel> urls, Regex reg)
    {
        foreach (Match item in reg.Matches(html))
        {
            string link = CleanLink(item.Groups["Url"].Value);
            if (link.Length == 0)
            {
                continue;
            }

            // First occurrence wins; checked up front instead of relying on
            // Dictionary.Add throwing for a duplicate key.
            string key = link.ToLowerInvariant();
            if (urls.ContainsKey(key))
            {
                continue;
            }

            UrlModel model = new UrlModel();
            model.RelatedPath = link;
            model.CurrPath = currUrl.CurrPath;
            model.RootPath = currUrl.RootPath;
            model.Scheme = currUrl.Scheme;
            model.Port = currUrl.Port;
            model.Host = currUrl.Host;
            model.AbsoluteUri = Resolve(currUrl, link);
            urls.Add(key, model);
        }
    }

    // Strips surrounding whitespace and the quotes that CSS url('...') / url("...") captures carry.
    private static string CleanLink(string raw)
    {
        return raw.Trim().Trim('\'', '"').Trim();
    }

    // Turns a site-absolute ("/x") or relative ("x", "../x") link into a full address.
    private static string Resolve(UrlModel currUrl, string link)
    {
        if (link.StartsWith("/", System.StringComparison.Ordinal))
        {
            // Site-absolute path.
            return string.Format("{0}{1}", currUrl.RootPath, link);
        }

        // Relative path: walk up one folder per leading "../".
        string basePath = currUrl.CurrPath;
        string rest = link;
        while (rest == ".." || rest.StartsWith("../", System.StringComparison.Ordinal))
        {
            rest = rest.Length > 3 ? rest.Substring(3) : string.Empty;

            // A slash before index 8 belongs to "scheme://", so stop at the site
            // root instead of cutting into the scheme.
            int slash = basePath.LastIndexOf('/');
            if (slash >= 8)
            {
                basePath = basePath.Substring(0, slash);
            }
        }
        return string.Format("{0}/{1}", basePath, rest);
    }
}
4.網頁源碼扒取類
/// <summary>
/// Minimal HTTP helpers used to fetch page source and to download files.
/// </summary>
public class HttpTool
{
    /// <summary>
    /// Issues a GET request and returns the response body as trimmed text.
    /// </summary>
    /// <param name="url">Address to request.</param>
    /// <param name="referer">Value sent in the Referer header.</param>
    /// <param name="encoding">Name of the text encoding used to decode the body.</param>
    /// <param name="msg">
    /// Empty on success; otherwise the exception message followed by its stack trace.
    /// </param>
    /// <returns>The response text, or an empty string when the request failed.</returns>
    public static string HttpGet(string url, string referer, string encoding, out string msg)
    {
        msg = string.Empty;
        try
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8";
            request.Referer = referer;
            request.Method = "GET";
            request.UserAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36";
            request.Timeout = 60000; // one minute

            // using-blocks release the connection even when reading or decoding throws.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream responseStream = response.GetResponseStream())
            {
                if (responseStream == null)
                {
                    return string.Empty;
                }

                using (StreamReader reader = new StreamReader(responseStream, System.Text.Encoding.GetEncoding(encoding)))
                {
                    return reader.ReadToEnd().Trim();
                }
            }
        }
        catch (Exception ex)
        {
            msg = ex.Message + ex.StackTrace;
        }
        return string.Empty;
    }

    /// <summary>
    /// Downloads <paramref name="uRLAddress"/> and writes it to
    /// <paramref name="filename"/> inside <paramref name="localPath"/>.
    /// </summary>
    /// <param name="uRLAddress">Address of the resource to download.</param>
    /// <param name="localPath">Existing directory to write into.</param>
    /// <param name="filename">Name of the file to create or overwrite.</param>
    /// <remarks>
    /// The content is streamed in chunks, so files of any size are written in full
    /// (previously everything after the first 1,000,000 bytes was silently dropped).
    /// The target is opened with FileMode.Create so an existing longer file does not
    /// leave stale bytes behind. Exceptions propagate to the caller.
    /// </remarks>
    public static void DownFile(string uRLAddress, string localPath, string filename)
    {
        using (WebClient client = new WebClient())
        using (Stream source = client.OpenRead(uRLAddress))
        {
            string path = Path.Combine(localPath, filename);
            try
            {
                using (FileStream target = new FileStream(path, FileMode.Create, FileAccess.Write))
                {
                    byte[] buffer = new byte[81920];
                    int read;
                    while ((read = source.Read(buffer, 0, buffer.Length)) > 0)
                    {
                        target.Write(buffer, 0, read);
                    }
                }
            }
            catch
            {
                // Do not leave a partial file behind: a caller that skips files
                // which already exist would never retry it.
                if (File.Exists(path))
                {
                    File.Delete(path);
                }
                throw;
            }
        }
    }
}
5.網站克隆主類
介面:
/// <summary>
/// Contract for a clone job that can be started and cancelled.
/// </summary>
interface IWebCloneWorker
{
    /// <summary>Starts the clone job.</summary>
    void Start();

    /// <summary>Requests cancellation of the clone job.</summary>
    void Cancel();
}
實現類:
/// <summary>
/// Collects the pages and resource files of a web site with a
/// <see cref="BackgroundWorker"/> and saves each of them under <see cref="SavePath"/>.
/// </summary>
public class WebCloneWorker : IWebCloneWorker
{
    // Crawl depth (e.g. 0 - home page, 1 - category pages, 2 - detail pages).
    public static int depth = 0;

    // Address of the site to clone.
    public string Url { get; set; }

    // Directory the cloned files are saved to.
    public string SavePath { get; set; }

    private BackgroundWorker backgroundWorker1 = null;

    public event UrlChangedEventHandler UrlChanged;
    public event FileSavedSuccessEventHandler FileSavedSuccess;
    public event FileSavedFailEventHandler FileSavedFail;
    public event DownloadCompletedEventHandler DownloadCompleted;
    public event CollectingUrlEventHandler CollectingUrl;
    public event CollectedUrlEventHandler CollectedUrl;
    public event ProgressChangedEventHandler ProgressChanged;

    // All collected page and resource addresses, keyed by UrlModel.RelatedPath.
    private Dictionary<string, UrlModel> _Hrefs = new Dictionary<string, UrlModel>();

    /// <summary>
    /// All collected page and resource addresses.
    /// </summary>
    public Dictionary<string, UrlModel> Hrefs
    {
        get { return _Hrefs; }
        set { _Hrefs = value; }
    }

    // Encoding name used when requesting pages; defaults to UTF-8.
    private string _Encoding = "utf-8";

    /// <summary>
    /// Encoding name used when requesting pages; defaults to UTF-8.
    /// </summary>
    public string Encoding
    {
        get { return _Encoding; }
        set { _Encoding = value; }
    }

    /// <summary>
    /// Creates a worker whose <see cref="Url"/> and <see cref="SavePath"/> are set later.
    /// </summary>
    public WebCloneWorker()
    {
        // Without this, Start() and Cancel() dereferenced a null worker.
        InitializeWorker();
    }

    /// <summary>
    /// Creates a worker for the given site and target directory.
    /// </summary>
    /// <param name="url">Address of the site to clone.</param>
    /// <param name="path">Directory to save the files to.</param>
    /// <exception cref="ArgumentException">Either argument is null or empty.</exception>
    public WebCloneWorker(string url, string path)
    {
        this.Url = url;
        this.SavePath = path;

        if (string.IsNullOrEmpty(this.Url))
        {
            throw new ArgumentException("請輸入網址");
        }
        if (string.IsNullOrEmpty(this.SavePath))
        {
            throw new ArgumentException("請選擇要保存的目錄");
        }

        InitializeWorker();
    }

    // Creates the background worker and wires up its three callbacks.
    private void InitializeWorker()
    {
        backgroundWorker1 = new BackgroundWorker();
        backgroundWorker1.WorkerReportsProgress = true;
        backgroundWorker1.WorkerSupportsCancellation = true;
        backgroundWorker1.DoWork += backgroundWorker1_DoWork;
        backgroundWorker1.ProgressChanged += backgroundWorker1_ProgressChanged;
        backgroundWorker1.RunWorkerCompleted += backgroundWorker1_RunWorkerCompleted;
    }

    // Raises DownloadCompleted once the work has finished without being cancelled.
    void backgroundWorker1_RunWorkerCompleted(object sender, RunWorkerCompletedEventArgs e)
    {
        if (e.Cancelled)
        {
            return;
        }

        DownloadCompletedEventHandler handler = this.DownloadCompleted;
        if (handler == null)
        {
            return;
        }

        // RunWorkerCompletedEventArgs.Result throws when the work ended with an
        // error, so it is only read on success; failures are reported via e.Error.
        object result = e.Error == null ? e.Result : null;
        handler(this, new DownloadCompletedEventArgs(result, e.Error, e.Cancelled));
    }

    // Forwards the progress notification, then downloads the reported URL to disk.
    // NOTE(review): BackgroundWorker normally raises ProgressChanged on the thread that
    // started it, so in a UI application this download likely blocks the UI thread —
    // confirm, and consider moving the download into DoWork.
    void backgroundWorker1_ProgressChanged(object sender, ProgressChangedEventArgs e)
    {
        ProgressChangedEventHandler progressHandler = this.ProgressChanged;
        if (progressHandler != null)
        {
            progressHandler(this, e);
        }

        UrlModel model = (UrlModel)e.UserState;

        UrlChangedEventHandler urlHandler = this.UrlChanged;
        if (urlHandler != null)
        {
            urlHandler(this, new UrlChangedEventArgs(model));
        }

        try
        {
            string url = model.AbsoluteUri;

            // Path part of the address (index 8 skips past "http://" / "https://").
            string absolutePath = url.Substring(url.IndexOf('/', 8));

            // Drop query string and fragment before deriving file and folder names.
            // The old code cut absolutePath at the '?' position of RelatedPath, which
            // is a different string for relative links and produced wrong file names.
            int cut = absolutePath.IndexOfAny(new char[] { '?', '#' });
            if (cut >= 0)
            {
                absolutePath = absolutePath.Substring(0, cut);
            }

            string fileName = Path.GetFileName(absolutePath);

            // No file name, or one without an extension: treat it as a folder's default page.
            if (string.IsNullOrEmpty(fileName) || fileName.IndexOf(".") < 0)
            {
                fileName = "index.html";
                if (!absolutePath.EndsWith("/"))
                {
                    absolutePath = absolutePath + "/";
                }
            }
            fileName = System.Web.HttpUtility.UrlDecode(fileName);

            string localPath = string.Format("{0}{1}", this.SavePath, Path.GetDirectoryName(absolutePath));
            if (!Directory.Exists(localPath))
            {
                Directory.CreateDirectory(localPath);
            }

            // Files that already exist are not downloaded again.
            if (File.Exists(Path.Combine(localPath, fileName)))
            {
                return;
            }

            // Download the page, image or other resource.
            HttpTool.DownFile(url, localPath, fileName);

            FileSavedSuccessEventHandler savedHandler = this.FileSavedSuccess;
            if (savedHandler != null)
            {
                savedHandler(this, new FileSavedSuccessEventArgs(model));
            }
        }
        catch (Exception ex)
        {
            FileSavedFailEventHandler failHandler = this.FileSavedFail;
            if (failHandler != null)
            {
                failHandler(this, new FileSavedFailEventArgs(ex));
            }
        }
    }

    // Worker body: collects every URL first, then reports them one by one.
    void backgroundWorker1_DoWork(object sender, DoWorkEventArgs e)
    {
        GetResource();

        // Collection stops early on cancellation; report that as a cancel even
        // when nothing was collected.
        if (backgroundWorker1.CancellationPending)
        {
            e.Cancel = true;
            return;
        }

        int index = 1;
        foreach (KeyValuePair<string, UrlModel> entry in this.Hrefs)
        {
            if (backgroundWorker1.CancellationPending)
            {
                e.Cancel = true;
                return;
            }

            backgroundWorker1.ReportProgress(index, entry.Value);
            index++;

            // Pause 200 ms between items.
            Thread.Sleep(200);
        }
    }

    /// <summary>
    /// Starts the clone job; does nothing when one is already running.
    /// </summary>
    public void Start()
    {
        if (this.backgroundWorker1.IsBusy)
        {
            return;
        }
        this.backgroundWorker1.RunWorkerAsync();
    }

    /// <summary>
    /// Requests cancellation; does nothing when a cancellation is already pending.
    /// </summary>
    public void Cancel()
    {
        if (this.backgroundWorker1.CancellationPending)
        {
            return;
        }
        this.backgroundWorker1.CancelAsync();
    }

    // Fetches the start page, collects all links from it, then raises CollectedUrl.
    private void GetResource()
    {
        string msg;
        string html = HttpTool.HttpGet(this.Url, this.Url, this.Encoding, out msg);

        GetHrefs(0, this.Url, html);

        CollectedUrlEventHandler handler = this.CollectedUrl;
        if (handler != null)
        {
            handler(this, new CollectedUrlEventArgs(new UrlModel()));
        }
    }

    // Collects the page itself, its resource files, its linked pages and their
    // resource files; recurses into linked pages while level is below depth.
    private void GetHrefs(int level, string url, string html)
    {
        UrlModel currUrl = UrlParser.Parse(url);
        if (!TryCollect(currUrl))
        {
            return;
        }

        // Linked pages (href attributes).
        List<UrlModel> pages = WebPageService.GetLocalHrefs(url, html);
        // Images, scripts and other resource files (src attributes / CSS url()).
        List<UrlModel> resources = WebPageService.GetLocalSrcs(url, html);

        if (resources != null)
        {
            foreach (UrlModel resource in resources)
            {
                if (!TryCollect(resource))
                {
                    return;
                }
            }
        }

        if (pages == null)
        {
            return;
        }

        foreach (UrlModel page in pages)
        {
            if (!TryCollect(page))
            {
                return;
            }

            string msg;
            string pageHtml = HttpTool.HttpGet(page.AbsoluteUri, page.AbsoluteUri, this.Encoding, out msg);

            // Resource files of the linked page.
            List<UrlModel> pageResources = WebPageService.GetLocalSrcs(page.AbsoluteUri, pageHtml);
            if (pageResources != null)
            {
                foreach (UrlModel pageResource in pageResources)
                {
                    if (!TryCollect(pageResource))
                    {
                        return;
                    }
                    // Pause 20 ms.
                    Thread.Sleep(20);
                }
            }

            // Pause 20 ms.
            Thread.Sleep(20);

            // At the configured depth: do not descend further, but keep processing
            // the remaining links of this page (a return here dropped all of them
            // after the first one).
            if (level >= depth)
            {
                continue;
            }

            // A link without a resolved address cannot be parsed or crawled.
            if (string.IsNullOrEmpty(page.AbsoluteUri))
            {
                continue;
            }

            GetHrefs(level + 1, page.AbsoluteUri, pageHtml);
        }
    }

    // Records one URL (first occurrence of a RelatedPath wins) and raises CollectingUrl.
    // Returns false only when cancellation has been requested.
    private bool TryCollect(UrlModel model)
    {
        if (backgroundWorker1.CancellationPending)
        {
            return false;
        }

        if (model == null || model.RelatedPath == null || this.Hrefs.ContainsKey(model.RelatedPath))
        {
            return true;
        }

        this.Hrefs.Add(model.RelatedPath, model);

        CollectingUrlEventHandler handler = this.CollectingUrl;
        if (handler != null)
        {
            try
            {
                handler(this, new CollectingUrlEventArgs(model));
            }
            catch (Exception)
            {
                // Best effort, as before: a failing subscriber must not abort collection.
            }
        }
        return true;
    }
}
6.一些事件、委托類:
/// <summary>Handler signature for the WebCloneWorker.UrlChanged event.</summary>
public delegate void UrlChangedEventHandler(object sender, UrlChangedEventArgs e);
/// <summary>Handler signature for the WebCloneWorker.FileSavedSuccess event.</summary>
public delegate void FileSavedSuccessEventHandler(object sender, FileSavedSuccessEventArgs e);
/// <summary>Handler signature for the WebCloneWorker.FileSavedFail event.</summary>
public delegate void FileSavedFailEventHandler(object sender, FileSavedFailEventArgs e);
/// <summary>Handler signature for the WebCloneWorker.DownloadCompleted event.</summary>
public delegate void DownloadCompletedEventHandler(object sender, DownloadCompletedEventArgs e);
/// <summary>Handler signature for the WebCloneWorker.CollectingUrl event.</summary>
public delegate void CollectingUrlEventHandler(object sender, CollectingUrlEventArgs e);
/// <summary>Handler signature for the WebCloneWorker.CollectedUrl event.</summary>
public delegate void CollectedUrlEventHandler(object sender, CollectedUrlEventArgs e);
/// <summary>Handler signature for the WebCloneWorker.ProgressChanged event.</summary>
// NOTE(review): ProgressChangedEventArgs is not declared in this listing; presumably
// it is System.ComponentModel.ProgressChangedEventArgs — confirm.
public delegate void ProgressChangedEventHandler(object sender, ProgressChangedEventArgs e);
public class CollectedUrlEventArgs : EventArgs
public class CollectingUrlEventArgs : EventArgs
public class DownloadCompletedEventArgs : EventArgs
public class FileSavedFailEventArgs : EventArgs
public class FileSavedSuccessEventArgs : EventArgs
public class UrlChangedEventArgs : EventArgs
代碼有點多,各位有需要的還是下載源碼查看並運行吧,由於趕時間,沒時間仔細測試程式的各個功能,難免有不足的地方。
百度網盤:鏈接:https://pan.baidu.com/s/1hja1rl9UEcl0dzTqVFt0dg 密碼:7s6r