最近對爬蟲很感興趣,稍微研究了一下,利用HtmlAgilityPack製作了一個十分簡單的爬蟲,這個簡易爬蟲只能獲取靜態頁面的Html ...
最近對爬蟲很感興趣,稍微研究了一下,利用HtmlAgilityPack製作了一個十分簡單的爬蟲。這個簡易爬蟲只能獲取靜態頁面的Html,無法取得由JavaScript動態生成的內容。
HtmlAgilityPack簡介
HtmlAgilityPack是一個解析速度十分快的開源Html解析工具,支持使用XPath解析Html,能夠幫助我們像解析Xml文檔一樣輕鬆、方便地解析Html文檔。
C#安裝HtmlAgilityPack
- 如果VS安裝有Nuget,在Nuget直接搜索安裝即可。
- 如果沒有Nuget,也可以手動下載HtmlAgilityPack;解壓縮後有3個文件,這裡只需要將其中的HtmlAgilityPack.dll、HtmlAgilityPack.xml引入解決方案中即可使用。
實例(獲取某頁面圖片)
載入HTML頁面
// Download the page at the given address and parse it into an HtmlDocument
var web = new HtmlWeb();
string url = "https://www.bilibili.com";
HtmlDocument hd = web.Load(url);
利用WebClient寫一個圖片下載器
需要using System.Net(WebClient所在的命名空間)和using System.IO(Directory所在的命名空間)
/// <summary>
/// Downloads images to the local disk through a caller-supplied <see cref="WebClient"/>.
/// </summary>
public class ImgDownloader
{
    /// <summary>
    /// Downloads one image into <paramref name="folderPath"/>, creating the folder when needed.
    /// A failed download is reported on the console instead of being thrown, so a single bad
    /// URL does not abort the whole crawl.
    /// </summary>
    /// <param name="webClient">Client used to perform the download.</param>
    /// <param name="url">
    /// Image URL. When it does not start with http:// or https:// (for example a
    /// protocol-relative "//host/a.png"), "https:" is prepended.
    /// </param>
    /// <param name="folderPath">Destination folder.</param>
    /// <param name="fileName">File name to save the image as inside the folder.</param>
    public static void DownloadImg(WebClient webClient, string url, string folderPath, string fileName)
    {
        // CreateDirectory does nothing when the directory already exists, so no Exists check is needed.
        Directory.CreateDirectory(folderPath);

        // Only the start of the URL decides whether a scheme is present; an "http:" that merely
        // appears inside a query string must not suppress the prefix.
        url = url.Trim();
        bool hasHttpScheme = url.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
            || url.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
        if (!hasHttpScheme)
        {
            url = "https:" + url;
        }

        try
        {
            // Path.Combine inserts the separator when folderPath does not end with one.
            webClient.DownloadFile(url, Path.Combine(folderPath, fileName));
            Console.WriteLine(fileName + "下載成功");
        }
        catch (Exception ex)
        {
            // Deliberately best-effort: report the failure together with the URL and carry on.
            Console.Write(ex.Message);
            Console.WriteLine(url);
        }
    }
}
通過XPath獲取img標籤中的圖片
// Client shared by all downloads below.
WebClient wc = new WebClient();
string imgPath = "//img";// XPath selecting every <img> element
int imgNum = 0;// running image number, used as the file name
// SelectNodes returns null rather than an empty collection when nothing matches,
// so it must be checked before iterating.
HtmlNodeCollection imgNodes = hd.DocumentNode.SelectNodes(imgPath);
if (imgNodes != null)
{
    foreach (HtmlNode node in imgNodes)
    {
        HtmlAttribute srcAttribute = node.Attributes["src"];
        if (srcAttribute == null || string.IsNullOrWhiteSpace(srcAttribute.Value))
        {
            continue;
        }
        string imgUrl = srcAttribute.Value.Trim();
        imgNum++;
        // Derive the extension from the URL path only: drop any query string or fragment,
        // and accept a dot only when it sits in the last path segment.
        string urlPath = imgUrl;
        int cut = urlPath.IndexOfAny(new[] { '?', '#' });
        if (cut >= 0)
        {
            urlPath = urlPath.Substring(0, cut);
        }
        int dot = urlPath.LastIndexOf('.');
        string extension = dot > urlPath.LastIndexOf('/') ? urlPath.Substring(dot) : string.Empty;
        string fileName = imgNum + extension;
        ImgDownloader.DownloadImg(wc, imgUrl, "images/", fileName);
    }
}
通過Xpath獲取背景圖
// Download images referenced through inline background-image styles
string bgImgPath = "//*[@style]";// XPath selecting every node that has a style attribute
// SelectNodes returns null rather than an empty collection when nothing matches.
HtmlNodeCollection styledNodes = hd.DocumentNode.SelectNodes(bgImgPath);
if (styledNodes != null)
{
    foreach (HtmlNode node in styledNodes)
    {
        // Anchor the match on the background-image declaration itself (tolerating whitespace
        // and optional quotes) so that other parenthesised values in the style are not picked up.
        Match bgMatch = Regex.Match(
            node.Attributes["style"].Value,
            @"background-image\s*:\s*url\(\s*['""]?(?<url>[^'"")]+?)\s*['""]?\s*\)",
            RegexOptions.IgnoreCase);
        if (!bgMatch.Success)
        {
            continue;
        }
        imgNum++;
        string bgImgUrl = bgMatch.Groups["url"].Value;
        // Derive the extension from the URL path only: drop any query string or fragment,
        // and accept a dot only when it sits in the last path segment.
        string urlPath = bgImgUrl;
        int cut = urlPath.IndexOfAny(new[] { '?', '#' });
        if (cut >= 0)
        {
            urlPath = urlPath.Substring(0, cut);
        }
        int dot = urlPath.LastIndexOf('.');
        string extension = dot > urlPath.LastIndexOf('/') ? urlPath.Substring(dot) : string.Empty;
        string fileName = imgNum + extension;
        ImgDownloader.DownloadImg(wc, bgImgUrl, "images/bgcImg/", fileName);
    }
}
完整代碼
using System;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using HtmlAgilityPack;
namespace WebCrawlerDemo
{
/// <summary>
/// Console entry point: loads one page and downloads the images it references through
/// img tags and inline background-image styles.
/// </summary>
class Program
{
    // Matches the url(...) of an inline background-image declaration, tolerating whitespace
    // and optional quotes; the named group "url" holds the bare address.
    private static readonly Regex BackgroundImageUrl = new Regex(
        @"background-image\s*:\s*url\(\s*['""]?(?<url>[^'"")]+?)\s*['""]?\s*\)",
        RegexOptions.IgnoreCase | RegexOptions.Compiled);

    /// <summary>
    /// Loads the page, downloads its images into "images/" and "images/bgcImg/",
    /// then waits for a key press.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args)
    {
        string url = "https://www.bilibili.com";
        HtmlWeb web = new HtmlWeb();
        HtmlDocument hd = web.Load(url);// download and parse the html page

        // WebClient is disposable; release it once every download has finished.
        using (WebClient wc = new WebClient())
        {
            int imgNum = DownloadImgTagImages(hd, wc, 0);
            DownloadBackgroundImages(hd, wc, imgNum);
        }

        Console.WriteLine("----------END----------");
        Console.ReadKey();
    }

    // Downloads the src of every <img> element; returns the updated image counter.
    private static int DownloadImgTagImages(HtmlDocument hd, WebClient wc, int imgNum)
    {
        // SelectNodes returns null rather than an empty collection when nothing matches.
        HtmlNodeCollection imgNodes = hd.DocumentNode.SelectNodes("//img");
        if (imgNodes == null)
        {
            return imgNum;
        }

        foreach (HtmlNode node in imgNodes)
        {
            HtmlAttribute srcAttribute = node.Attributes["src"];
            if (srcAttribute == null || string.IsNullOrWhiteSpace(srcAttribute.Value))
            {
                continue;
            }

            string imgUrl = srcAttribute.Value.Trim();
            imgNum++;
            ImgDownloader.DownloadImg(wc, imgUrl, "images/", imgNum + GetExtension(imgUrl));
        }

        return imgNum;
    }

    // Downloads every image referenced by an inline background-image style; returns the updated counter.
    private static int DownloadBackgroundImages(HtmlDocument hd, WebClient wc, int imgNum)
    {
        // SelectNodes returns null rather than an empty collection when nothing matches.
        HtmlNodeCollection styledNodes = hd.DocumentNode.SelectNodes("//*[@style]");
        if (styledNodes == null)
        {
            return imgNum;
        }

        foreach (HtmlNode node in styledNodes)
        {
            Match bgMatch = BackgroundImageUrl.Match(node.Attributes["style"].Value);
            if (!bgMatch.Success)
            {
                continue;
            }

            imgNum++;
            string bgImgUrl = bgMatch.Groups["url"].Value;
            ImgDownloader.DownloadImg(wc, bgImgUrl, "images/bgcImg/", imgNum + GetExtension(bgImgUrl));
        }

        return imgNum;
    }

    // Returns the file extension (including the dot) of the URL's last path segment, or "" when it has none.
    private static string GetExtension(string url)
    {
        // A query string or fragment is not part of the file name.
        int cut = url.IndexOfAny(new[] { '?', '#' });
        string path = cut >= 0 ? url.Substring(0, cut) : url;

        // A dot before the last '/' belongs to the host or a directory, not to the file.
        int dot = path.LastIndexOf('.');
        return dot > path.LastIndexOf('/') ? path.Substring(dot) : string.Empty;
    }
}
/// <summary>
/// Downloads images to the local disk through a caller-supplied <see cref="WebClient"/>.
/// </summary>
public class ImgDownloader
{
    /// <summary>
    /// Downloads one image into <paramref name="folderPath"/>, creating the folder when needed.
    /// A failed download is reported on the console instead of being thrown, so a single bad
    /// URL does not abort the whole crawl.
    /// </summary>
    /// <param name="webClient">Client used to perform the download.</param>
    /// <param name="url">
    /// Image URL. When it does not start with http:// or https:// (for example a
    /// protocol-relative "//host/a.png"), "https:" is prepended.
    /// </param>
    /// <param name="folderPath">Destination folder.</param>
    /// <param name="fileName">File name to save the image as inside the folder.</param>
    public static void DownloadImg(WebClient webClient, string url, string folderPath, string fileName)
    {
        // CreateDirectory does nothing when the directory already exists, so no Exists check is needed.
        Directory.CreateDirectory(folderPath);

        // Only the start of the URL decides whether a scheme is present; an "http:" that merely
        // appears inside a query string must not suppress the prefix.
        url = url.Trim();
        bool hasHttpScheme = url.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
            || url.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
        if (!hasHttpScheme)
        {
            url = "https:" + url;
        }

        try
        {
            // Path.Combine inserts the separator when folderPath does not end with one.
            webClient.DownloadFile(url, Path.Combine(folderPath, fileName));
            Console.WriteLine(fileName + "下載成功");
        }
        catch (Exception ex)
        {
            // Deliberately best-effort: report the failure together with the URL and carry on.
            Console.Write(ex.Message);
            Console.WriteLine(url);
        }
    }
}
}