csharp:正則表達式採集網頁數據

来源:http://www.cnblogs.com/geovindu/archive/2016/02/19/5200334.html
-Advertisement-
Play Games

參考文檔(Regex 類):https://msdn.microsoft.com/zh-cn/library/system.text.regularexpressions.regex(v=vs.110).aspx


https://msdn.microsoft.com/zh-cn/library/system.text.regularexpressions.regex(v=vs.110).aspx

 /// <summary>
        /// Form load handler. Runs the two single-row extractors against hard-coded
        /// sample HTML and shows each result in a message box, then extracts every
        /// code/name pair from a multi-row sample and binds the list to dataGridView1.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event data (not used).</param>
        private void StatsoneForm_Load(object sender, EventArgs e)
        {
            // Sample 1: a table row carrying a numeric code cell and a name cell.
            string s = @"<tr height='19' style='height:14.25pt;mso-height-source:userset;mso-height-alt:285;'>    <td class='xl67' height='19' style='height:14.25pt;'></td>    <td class='xl71' x:num>110000</td>    <td class='xl71' x:str>北京市</td>    <td class='xl67'></td>    <td class='xl70'></td>    <td class='xl70'></td>    <td class='xl70'></td>    <td colspan='3' style='mso-ignore:colspan;'></td>   </tr>";
            string f = ExtensionPost(s);
            MessageBox.Show(f);

            // Sample 2: a paragraph carrying the code in one span and the name in the next.
            string sb = @"<p class='msonormal' style='line-height: 150%'><span lang='en-us' style='line-height: 150%; font-family: 'times new roman', 'serif'; font-size: 12pt'>110000<span>  </span></span><span style='line-height: 150%; font-family: 宋體; font-size: 12pt'>北京市</span></p>";
            string fb = ExtensionPostb(sb);
            MessageBox.Show(fb);

            // Sample 3: six consecutive paragraphs in the same format as sample 2.
            string strhtml = @"<p class='msonormal' style='line-height: 150%'><span lang='en-us' style='line-height: 150%; font-family: 'times new roman', 'serif'; font-size: 12pt'>110000<span>  </span></span><span style='line-height: 150%; font-family: 宋體; font-size: 12pt'>北京市</span></p><p class='msonormal' style='line-height: 150%'><span lang='en-us' style='line-height: 150%; font-family: 'times new roman', 'serif'; font-size: 12pt'>110100<span>  </span></span><span style='line-height: 150%; font-family: 宋體; font-size: 12pt'>市轄區</span></p><p class='msonormal' style='line-height: 150%'><span lang='en-us' style='line-height: 150%; font-family: 'times new roman', 'serif'; font-size: 12pt'>110101<span>  </span></span><span style='line-height: 150%; font-family: 宋體; font-size: 12pt'>東城區</span></p><p class='msonormal' style='line-height: 150%'><span lang='en-us' style='line-height: 150%; font-family: 'times new roman', 'serif'; font-size: 12pt'>110102<span>  </span></span><span style='line-height: 150%; font-family: 宋體; font-size: 12pt'>西城區</span></p><p class='msonormal' style='line-height: 150%'><span lang='en-us' style='line-height: 150%; font-family: 'times new roman', 'serif'; font-size: 12pt'>110105<span>  </span></span><span style='line-height: 150%; font-family: 宋體; font-size: 12pt'>朝陽區</span></p><p class='msonormal' style='line-height: 150%'><span lang='en-us' style='line-height: 150%; font-family: 'times new roman', 'serif'; font-size: 12pt'>110106<span>  </span></span><span style='line-height: 150%; font-family: 宋體; font-size: 12pt'>丰台區</span></p>";
            IEnumerable<AreaHtmlValue> htmlValue = GetRegValue(@"<p class='msonormal' style='line-height: 150%'><span lang='en-us' style='line-height: 150%; font-family: 'times new roman', 'serif'; font-size: 12pt'>(?<code>\d+)<span>  </span></span><span style='line-height: 150%; font-family: 宋體; font-size: 12pt'>(?<name>\w*)</span></p>", strhtml);

            // The sample codes are six digits long. Keeping only five digits made
            // distinct rows collide (110101 and 110102 both became "11010") and
            // threw on shorter captures, so keep at most six digits instead.
            List<AreaInfo> areaList = (from v in htmlValue
                                       select new AreaInfo
                                       {
                                           AreaCode = v.Code.Length > 6 ? v.Code.Substring(0, 6) : v.Code,
                                           AreaName = v.Name,
                                           AreaFullName = v.Name,
                                           ParentAreaCode = "0",
                                           ParentId = 0,
                                           CreateTime = DateTime.Now,
                                           AreaYear = 2015
                                       }).ToList();

            this.dataGridView1.DataSource = areaList;
        }
        /// <summary>
        /// Extracts the numeric code and the name from a single table row whose
        /// code cell is marked x:num and whose name cell is marked x:str, and
        /// returns them concatenated as name followed by code.
        /// Rows whose name is wrapped in extra span/font elements do not match
        /// the pattern used here.
        /// </summary>
        /// <param name="url">The HTML fragment to inspect (despite the name, this is not a URL).</param>
        /// <returns>Name followed by code, or an empty string when the row does not match.</returns>
        static String ExtensionPost(String url)
        {
            // The pattern mirrors the row layout literally, including the runs of
            // spaces between cells, so the input must use the same spacing and
            // single-quoted attributes.
            Regex r = new Regex(@"<tr height='19' style='height:14.25pt;mso-height-source:userset;mso-height-alt:285;'>    <td class='xl67' height='19' style='height:14.25pt;'></td>    <td class='xl71' x:num>(?<port>\d+)</td>    <td class='xl71' x:str>(?<proto>\w+)</td>    <td class='xl67'></td>    <td class='xl70'></td>    <td class='xl70'></td>    <td class='xl70'></td>    <td colspan='3' style='mso-ignore:colspan;'></td>   </tr>",
            RegexOptions.Compiled);

            Match match = r.Match(url);

            // Match.Result throws NotSupportedException on a failed match, so
            // report "nothing found" as an empty string instead.
            return match.Success ? match.Result("${proto}${port}") : string.Empty;
        }
        /// <summary>
        /// Extracts the numeric code and the name from a single paragraph in which
        /// the code sits in the first span and the name in the following span, and
        /// returns them concatenated as name followed by code.
        /// </summary>
        /// <param name="url">The HTML fragment to inspect (despite the name, this is not a URL).</param>
        /// <returns>
        /// Name followed by code, or an empty string when the paragraph does not
        /// match. The name part may itself be empty because the pattern allows it.
        /// </returns>
        static string ExtensionPostb(string url)
        {
            Regex r = new Regex(@"<p class='msonormal' style='line-height: 150%'><span lang='en-us' style='line-height: 150%; font-family: 'times new roman', 'serif'; font-size: 12pt'>(?<port>\d+)<span>  </span></span><span style='line-height: 150%; font-family: 宋體; font-size: 12pt'>(?<proto>\w*)</span></p>",
            RegexOptions.Compiled);

            Match match = r.Match(url);

            // Match.Result throws NotSupportedException on a failed match, so
            // report "nothing found" as an empty string instead.
            return match.Success ? match.Result("${proto}${port}") : string.Empty;
        }


        /// <summary>
        /// Click handler. Downloads the administrative-division code page from
        /// http://www.stats.gov.cn/tjsj/tjbz/xzqhdm/201504/t20150415_712722.html,
        /// extracts every code/name pair, binds the resulting list to dataGridView2
        /// and shows the cleaned page source in richTextBox2.
        /// An alternative source with a table-row layout is
        /// http://files2.mca.gov.cn/www/201512/20151224151630189.htm; it needs a
        /// different pattern and is not used here.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event data (not used).</param>
        private void button1_Click(object sender, EventArgs e)
        {
            try
            {
                string url = "http://www.stats.gov.cn/tjsj/tjbz/xzqhdm/201504/t20150415_712722.html";

                // GetHtml returns the page after ReplaceEnter/RemoveStyle/RemoveScript,
                // i.e. lower-cased and with double quotes turned into single quotes,
                // which is why the pattern is written in that normalized form.
                // NOTE(review): confirm that the whitespace ReplaceEnter strips still
                // leaves the single spaces this pattern expects.
                IEnumerable<AreaHtmlValue> htmlValue = GetRegValue(@"<p class='msonormal' style='line-height: 150%'><span lang='en-us' style='line-height: 150%; font-family: 'times new roman', 'serif'; font-size: 12pt'>(?<code>\d+)<span>  </span></span><span style='line-height: 150%; font-family: 宋體; font-size: 12pt'>(?<name>\w*)</span></p>", GetHtml(url));

                List<AreaInfo> areaList = (from v in htmlValue
                                           select new AreaInfo
                                           {
                                               // Keep at most six digits; a shorter capture is kept
                                               // as-is instead of making Substring throw.
                                               AreaCode = v.Code.Length > 6 ? v.Code.Substring(0, 6) : v.Code,
                                               AreaName = v.Name,
                                               AreaFullName = v.Name,
                                               ParentAreaCode = "0",
                                               ParentId = 0,
                                               CreateTime = DateTime.Now,
                                               AreaYear = 2015
                                           }).ToList();

                this.dataGridView2.DataSource = areaList;

                // WebClient is disposable; release it as soon as the download is done.
                using (WebClient wc = new WebClient())
                {
                    string mainData = Encoding.UTF8.GetString(wc.DownloadData(url));
                    this.richTextBox2.Text = RemoveScript(RemoveStyle(ReplaceEnter(mainData)));
                }
            }
            catch (Exception ex)
            {
                // Previously the exception was discarded; surface it to the user instead.
                MessageBox.Show(ex.Message);
            }
        }
        #region 網頁源碼

        /// <summary>
        /// Downloads the page at <paramref name="url"/>, finds the src of its img
        /// tags and saves each referenced image under c:\g\, printing every image
        /// address to the console. Waits for a key press before returning.
        /// </summary>
        /// <param name="url">Absolute address of the page to scan for images.</param>
        private static void updowndimg(string url)
        {
            Uri pageUri = new Uri(url);

            // WebClient is disposable; one instance serves the page and all images.
            using (WebClient client = new WebClient())
            {
                string html = client.DownloadString(url);
                MatchCollection matches = Regex.Matches(html, "<img\\s*.*src=\"(.+?)\".*/>");
                foreach (Match match in matches)
                {
                    string img = match.Groups[1].Value.Replace("\"", string.Empty);

                    // The original prefixed the literal text "url/" instead of the
                    // page address. Resolve the src against the page URL so that
                    // both relative and absolute image addresses work.
                    Uri imageUri = new Uri(pageUri, img);

                    // Take the file name from the path only, so a query string
                    // does not end up in the local file name.
                    client.DownloadFile(imageUri, @"c:\g\" + Path.GetFileName(imageUri.LocalPath));
                    Console.WriteLine(imageUri);
                }
            }

            // NOTE(review): blocks until a key is pressed; presumably left over from
            // a console demo - confirm it is wanted when called from the form.
            Console.ReadKey();
        }
        /// <summary>
        /// Demo: searches a fixed sentence for words that start with "po" and end
        /// with "ion" (case-insensitive) and prints the matches via WriteMatches.
        /// </summary>
        static void Find_po()
        {
            const string sample = @" I can not find my position in Beijing ";
            const RegexOptions options = RegexOptions.ExplicitCapture
                                         | RegexOptions.IgnorePatternWhitespace
                                         | RegexOptions.IgnoreCase;

            WriteMatches(sample, Regex.Matches(sample, @"\bpo\S*ion\b", options));
        }
        /// <summary>
        /// Prints the searched text, the number of matches and, for every match,
        /// its index, its value and an excerpt showing up to five characters of
        /// context on either side.
        /// </summary>
        /// <param name="text">The text the matches were found in.</param>
        /// <param name="matches">The matches to report.</param>
        static void WriteMatches(string text, MatchCollection matches)
        {
            const int contextWidth = 5;

            Console.WriteLine("Original text was: \n\n" + text + "\n");
            Console.WriteLine("No. of matches: " + matches.Count);

            for (int i = 0; i < matches.Count; i++)
            {
                Match current = matches[i];
                int start = current.Index;
                string matched = current.Value;

                // Clamp the context so the excerpt never runs past either end of the text.
                int lead = Math.Min(start, contextWidth);
                int trail = Math.Min(text.Length - start - matched.Length, contextWidth);
                string excerpt = text.Substring(start - lead, lead + trail + matched.Length);

                Console.WriteLine("Index: {0}, \tString: {1}, \t{2}", start, matched, excerpt);
            }
        }
        /// <summary>
        /// Returns the protocol and, when present, the port of a URL.
        /// For example "http://www.yahoo.com.cn:8080/index.html" yields "http:8080".
        /// </summary>
        /// <param name="url">The URL to inspect; it must contain a "/" after the host.</param>
        /// <returns>
        /// The protocol followed by ":port" when a port is given, or an empty
        /// string when <paramref name="url"/> does not match.
        /// </returns>
        String Extension(String url)
        {
            Regex r = new Regex(@"^(?<proto>\w+)://[^/]+?(?<port>:\d+)?/",
            RegexOptions.Compiled);

            Match match = r.Match(url);

            // Match.Result throws NotSupportedException on a failed match, so
            // report "nothing found" as an empty string instead.
            return match.Success ? match.Result("${proto}${port}") : string.Empty;
        }
        /// <summary>
        /// Downloads the page at <paramref name="url"/>, decodes it as UTF-8 and
        /// returns it after passing it through ReplaceEnter, RemoveStyle and
        /// RemoveScript.
        /// </summary>
        /// <param name="url">Address of the page to download.</param>
        /// <returns>
        /// The cleaned page source, or an empty string when the download fails or
        /// the response has no body. Failures are logged, not thrown.
        /// </returns>
        private static string GetHtml(string url)
        {
            try
            {
                WebRequest webRequest = WebRequest.Create(url);

                // Response, stream and reader all hold network resources; dispose
                // them deterministically instead of leaving them to the finalizer.
                using (WebResponse webResponse = webRequest.GetResponse())
                using (Stream responseStream = webResponse.GetResponseStream())
                {
                    if (responseStream != null)
                    {
                        using (var respStreamReader = new StreamReader(responseStream, Encoding.UTF8))
                        {
                            // ReadToEnd replaces the manual 256-char read loop and its
                            // repeated string concatenation; the resulting text is the same.
                            string strBuff = respStreamReader.ReadToEnd();

                            return RemoveScript(RemoveStyle(ReplaceEnter(strBuff)));
                        }
                    }
                }
            }
            catch (Exception)
            {
                // Best effort: record the failing URL and fall through to the empty result.
                AreaLogHelper.WriteLogFile("【異常URL】" + url);
                Console.WriteLine("【異常URL】" + url);
            }

            return string.Empty;
        }

        /// <summary>
        ///     Normalizes page source for pattern matching: removes line breaks,
        ///     turns double quotes into single quotes, removes the whitespace runs
        ///     listed below and lower-cases the result.
        /// </summary>
        /// <param name="htmlCode">HTML source code.</param>
        /// <returns>The normalized source, or an empty string for null or empty input.</returns>
        private static string ReplaceEnter(string htmlCode)
        {
            if (string.IsNullOrEmpty(htmlCode))
                return string.Empty;
            // NOTE(review): the whitespace-only literals below may originally have been
            // different characters (tab, non-breaking or full-width space) - confirm
            // against the original source before editing them. The replacements run
            // left to right, so their order matters.
            return htmlCode.Replace("\r\n", "").Replace("\"", "'").Replace("\n", "").Replace("\r", "").Replace("   ", "").Replace("  ", "").Replace("    ", "").Replace("   ", "").Replace(" ", "").ToLower();
        }

           #region private methods 
            /// <summary>
            /// Removes HTML comments from <paramref name="input"/>.
            /// </summary>
            /// <param name="input">HTML source code.</param>
            /// <returns>The source with every comment removed.</returns>
            private static string RemoveComment(string input)
            {
                // The previous pattern "<!--[^-]*-->" skipped any comment that
                // contained a hyphen (e.g. "<!-- a-b -->"). A lazy match up to the
                // first "-->" handles those; Singleline lets it span line breaks.
                return Regex.Replace(input, @"<!--.*?-->", string.Empty, RegexOptions.Singleline);
            }
            /// <summary>
            /// Removes every style element, including its content, from
            /// <paramref name="input"/>. Matching ignores case and spans line breaks.
            /// </summary>
            /// <param name="input">HTML source code.</param>
            /// <returns>The source without style elements.</returns>
            private static string RemoveStyle(string input)
            {
                const RegexOptions options = RegexOptions.Singleline | RegexOptions.IgnoreCase;

                return Regex.Replace(input, @"<style[^>]*?>.*?</style>", string.Empty, options);
            }
            /// <summary>
            /// Removes every script element and then every noscript element,
            /// including their content, from <paramref name="input"/>. Matching
            /// ignores case and spans line breaks.
            /// </summary>
            /// <param name="input">HTML source code.</param>
            /// <returns>The source without script and noscript elements.</returns>
            private static string RemoveScript(string input)
            {
                const RegexOptions options = RegexOptions.Singleline | RegexOptions.IgnoreCase;
                string[] blockPatterns =
                {
                    @"<script[^>]*?>.*?</script>",
                    @"<noscript[^>]*?>.*?</noscript>"
                };

                string stripped = input;
                foreach (string pattern in blockPatterns)
                {
                    stripped = Regex.Replace(stripped, pattern, string.Empty, options);
                }

                return stripped;
            }
            /// <summary>
            /// Converts HTML to plain text: turns &lt;br&gt; into a line break, removes
            /// all remaining tags and decodes the entities nbsp, quot, lt, gt and amp.
            /// </summary>
            /// <param name="input">HTML source code.</param>
            /// <returns>The text content of <paramref name="input"/>.</returns>
            private static string RemoveTags(string input)
            {
                // Line breaks must be converted before the tags are stripped.
                string result = input.Replace("<br>", "\r\n");
                result = Regex.Replace(result, @"<[\s\S]*?>", string.Empty);

                // The entity names had been lost from these literals, leaving no-op
                // replacements and a Replace("", ...) call that throws
                // ArgumentException. Entities are decoded only after the tags are
                // gone so a decoded "<" or ">" is not taken for markup, and "&amp;"
                // goes last so "&amp;lt;" becomes "&lt;" rather than "<".
                result = result.Replace("&nbsp;", " ");
                result = result.Replace("&quot;", "\"");
                result = result.Replace("&lt;", "<");
                result = result.Replace("&gt;", ">");
                result = result.Replace("&amp;", "&");
                return result;
            }
            #endregion
        /// <summary>
        /// Runs a regular expression over page source and collects the "code" and
        /// "name" groups of every match.
        /// </summary>
        /// <param name="regexString">Pattern that defines the named groups "code" and "name".</param>
        /// <param name="remoteStr">HTML source code to search.</param>
        /// <returns>One entry per match, in match order; empty when nothing matches.</returns>
        private static IEnumerable<AreaHtmlValue> GetRegValue(string regexString, string remoteStr)
        {
            Regex extractor = new Regex(regexString, RegexOptions.Compiled);
            var values = new List<AreaHtmlValue>();

            foreach (Match hit in extractor.Matches(remoteStr))
            {
                // The Type property is intentionally left unset.
                values.Add(new AreaHtmlValue
                {
                    Code = hit.Groups["code"].Value,
                    Name = hit.Groups["name"].Value
                });
            }

            return values;
        }

        /// <summary>
        /// Holds the raw values captured from one pattern match.
        /// </summary>
        private class AreaHtmlValue
        {
            /// <summary>Text captured by the "code" group.</summary>
            public string Code { get; set; }

            /// <summary>Text captured by the "name" group.</summary>
            public string Name { get; set; }

            /// <summary>Optional type value; GetRegValue does not fill it.</summary>
            public string Type { get; set; }
        }

        #endregion

    }

  


您的分享是我們最大的動力!

-Advertisement-
Play Games
更多相關文章
  • 話說有一天,臨近下班無心工作,在網上看各種文章,閱讀到了一篇名為《聊聊大麥網UWP版的首頁頂部圖片聯動效果的實現方法》(傳遞:http://www.cnblogs.com/hippieZhou/p/4755290.html),看到別人評論自己做的產品,頓時來了興趣,閱讀過後,hippieZhou童鞋
  • 近段時間,需要寫一個小功能,就是需要判斷程式是否已經運行。某個程式安裝後,也許被多個用戶運行。那怎樣判斷當前用戶已經運行了此程式了呢?下麵是Insus.NET的做法,就是:《VB.NET WinForm獲取運行程式用戶名》http://www.cnblogs.com/insus/p/5194839.
  • (轉)這裡給大家分享幾個VS版本,都是最終版的,也是中文版的!. Visual Studio 2005:http://pan.baidu.com/s/1c0eudyS Visual Studio 2008:http://pan.baidu.com/s/1i3GJ7pj Visual Studio 2
  • 以下非原創作品,但都是自己看過理解並寫過,記錄下來,以便之後項目的使用或其它用途。 (1)只需要簡單配置單一屬性值: 1 <configuration> 2 <configSections> 3 <!--配置讀取的全名稱--> 4 <section name="simple" type="Confi
  • 使用 HttpResponse 對象 HttpResponse 對象是與 HttpRequest 對象相對應的,用來表示構建中的響應。它當中提供了方法和屬性可供我們自定義響應,有一些在使用 MVC 視圖的時候很少使用到,但是在使用其他組件的時候可能十分有用,比如模塊是處理器。 同 HttpReque
  • HTTP請求工具類,適用於微信伺服器請求,可以自測 代碼; 1 /// <summary> 2 /// HTTP請求工具類 3 /// </summary> 4 public class HttpRequestUtil 5 { 6 #region 請求Url 7 8 #region 請求Url,不發
  • 在工作中我們經常會遇到格式轉換的問題,有的時候是將JSON轉換成DataTable、DataSet或是List等,也有可能將DataTable、DataSet或是List轉換成JSON的,抽了點時間把這些方法整合了一下,希望對大家有所幫助,如果有什麼問題請指出來,共同探討。 代碼: 1 using
  • 使用 HttpRequest 對象 HttpRequest 對象描述的是一個正在被處理的 HTTP 請求。下表列舉了 HttpRequest 中的屬性,它們提供了當前請求的相關信息(HttpRequest 類定義了一些方法和屬性,我們會逐步講解當中的一些屬性)。 表 1 – HttpRequest
一周排行
    -Advertisement-
    Play Games
  • 移動開發(一):使用.NET MAUI開發第一個安卓APP 對於工作多年的C#程式員來說,近來想嘗試開發一款安卓APP,考慮了很久最終選擇使用.NET MAUI這個微軟官方的框架來嘗試體驗開發安卓APP,畢竟是使用Visual Studio開發工具,使用起來也比較的順手,結合微軟官方的教程進行了安卓 ...
  • 前言 QuestPDF 是一個開源 .NET 庫,用於生成 PDF 文檔。使用了C# Fluent API方式可簡化開發、減少錯誤並提高工作效率。利用它可以輕鬆生成 PDF 報告、發票、導出文件等。 項目介紹 QuestPDF 是一個革命性的開源 .NET 庫,它徹底改變了我們生成 PDF 文檔的方 ...
  • 項目地址 項目後端地址: https://github.com/ZyPLJ/ZYTteeHole 項目前端頁面地址: ZyPLJ/TreeHoleVue (github.com) https://github.com/ZyPLJ/TreeHoleVue 目前項目測試訪問地址: http://tree ...
  • 話不多說,直接開乾 一.下載 1.官方鏈接下載: https://www.microsoft.com/zh-cn/sql-server/sql-server-downloads 2.在下載目錄中找到下麵這個小的安裝包 SQL2022-SSEI-Dev.exe,運行開始下載SQL server; 二. ...
  • 前言 隨著物聯網(IoT)技術的迅猛發展,MQTT(消息隊列遙測傳輸)協議憑藉其輕量級和高效性,已成為眾多物聯網應用的首選通信標準。 MQTTnet 作為一個高性能的 .NET 開源庫,為 .NET 平臺上的 MQTT 客戶端與伺服器開發提供了強大的支持。 本文將全面介紹 MQTTnet 的核心功能 ...
  • Serilog支持多種接收器用於日誌存儲,增強器用於添加屬性,LogContext管理動態屬性,支持多種輸出格式包括純文本、JSON及ExpressionTemplate。還提供了自定義格式化選項,適用於不同需求。 ...
  • 目錄簡介獲取 HTML 文檔解析 HTML 文檔測試參考文章 簡介 動態內容網站使用 JavaScript 腳本動態檢索和渲染數據,爬取信息時需要模擬瀏覽器行為,否則獲取到的源碼基本是空的。 本文使用的爬取步驟如下: 使用 Selenium 獲取渲染後的 HTML 文檔 使用 HtmlAgility ...
  • 1.前言 什麼是熱更新 游戲或者軟體更新時,無需重新下載客戶端進行安裝,而是在應用程式啟動的情況下,在內部進行資源或者代碼更新 Unity目前常用熱更新解決方案 HybridCLR,Xlua,ILRuntime等 Unity目前常用資源管理解決方案 AssetBundles,Addressable, ...
  • 本文章主要是在C# ASP.NET Core Web API框架實現向手機發送驗證碼簡訊功能。這裡我選擇是一個互億無線簡訊驗證碼平臺,其實像阿裡雲,騰訊雲上面也可以。 首先我們先去 互億無線 https://www.ihuyi.com/api/sms.html 去註冊一個賬號 註冊完成賬號後,它會送 ...
  • 通過以下方式可以高效,並保證數據同步的可靠性 1.API設計 使用RESTful設計,確保API端點明確,並使用適當的HTTP方法(如POST用於創建,PUT用於更新)。 設計清晰的請求和響應模型,以確保客戶端能夠理解預期格式。 2.數據驗證 在伺服器端進行嚴格的數據驗證,確保接收到的數據符合預期格 ...