C#(.Net6) 正則(正規) 表達式 HTML 快速 切割/拆解/分割/拆分 字串 資料 測試範例 [爬蟲(spider)基礎]
C#(.Net6) 正則(正規) 表達式 HTML 快速 切割/拆解/分割/拆分 字串 資料 測試範例 [爬蟲(spider)基礎]
資料來源: https://blog.miniasp.com/post/2010/04/27/How-to-filter-special-characters-using-NET-Regex
https://regex101.com/
GITHUB: https://github.com/jash-git/CS_Regex_HTML_Spider
Code:
using System.Text.RegularExpressions; namespace CS_Regex_HTML_Spider { class Program { static String Get_Html_stock_d(string data) { //D<i parameter1="">9</i><span data-char="↓" decimals="2" d="">14.23</span></li> String StrResult = ""; MatchCollection matches = Regex.Matches(data, @"decimals=""2"" d="""">(([\s\S])*?)<"); // 一一取出 MatchCollection 內容 foreach (Match match in matches) { StrResult = match.Groups[1].Value; } return StrResult; } static String Get_Html_stock_k(string data) { //K<i parameter1="">9</i><span data-char="↓" decimals="2" k="">8.90</span></li> String StrResult = ""; MatchCollection matches = Regex.Matches(data, @"decimals=""2"" k="""">(([\s\S])*?)<"); // 一一取出 MatchCollection 內容 foreach (Match match in matches) { StrResult = match.Groups[1].Value; } return StrResult; } static String Get_Html_stock_rsi2(string data) { //RSI<i parameter2="">10</i><span data-char="" decimals="2" rsi2="">20.78</span></li> String StrResult = ""; MatchCollection matches = Regex.Matches(data, @"decimals=""2"" rsi2="""">(([\s\S])*?)<"); // 一一取出 MatchCollection 內容 foreach (Match match in matches) { StrResult = match.Groups[1].Value; } return StrResult; } static String Get_Html_stock_rsi1(string data) { //RSI<i parameter1="">5</i><span data-char="↓" decimals="2" rsi1="">13.90</span></li> String StrResult = ""; MatchCollection matches = Regex.Matches(data, @"decimals=""2"" rsi1="""">(([\s\S])*?)<"); // 一一取出 MatchCollection 內容 foreach (Match match in matches) { StrResult = match.Groups[1].Value; } return StrResult; } static String Get_Html_stock_lastQuoteTime(string data) { //<time class="last-time ml-5" id="lastQuoteTime">2022-10-14 13:30</time> String StrResult = ""; MatchCollection matches = Regex.Matches(data, @"id=""lastQuoteTime"">(([\s\S])*?)<"); // 一一取出 MatchCollection 內容 foreach (Match match in matches) { StrResult = match.Groups[1].Value; } return StrResult; } static String Get_Html_a_url(string data) { String StrResult = ""; MatchCollection matches = Regex.Matches(data, @"<a href=""(([\s\S])*?)"""); // 一一取出 MatchCollection 內容 foreach (Match match in matches) { //Console.WriteLine("Found '{0}' at position {1}", match.Value, match.Index); StrResult = match.Value; StrResult = StrResult.Replace("<a href=", ""); StrResult = StrResult.Replace("\"", ""); } return StrResult; } static String Get_Html_img_url(string data) { String StrResult = ""; MatchCollection matches = Regex.Matches(data, @"<img src=""(([\s\S])*?)"""); // 一一取出 MatchCollection 內容 foreach (Match match in matches) { //Console.WriteLine("Found '{0}' at position {1}", match.Value, match.Index); StrResult = match.Value; StrResult = StrResult.Replace("<img src=", ""); StrResult = StrResult.Replace("\"", ""); } return StrResult; } static void Pause() { Console.Write("Press any key to continue..."); Console.ReadKey(true); } static void Main(string[] args) { string s00 = @"<img src=""https://example.com/media/photo.jpg"" with=""600"" heigh=""400"" alt=""一張圖片"">"; string s11 = @"<a href=""https://www.baidu.com"" rel=""nofollow"">链接标签</a>"; string s22 = @"<time class=""last-time ml-5"" id=""lastQuoteTime"">2022-10-14 13:30</time>"; string s33 = @"RSI<i parameter1="""">5</i><span data-char=""↓"" decimals=""2"" rsi1="""">13.90</span></li>"; string s44 = @"RSI<i parameter2="""">10</i><span data-char="""" decimals=""2"" rsi2="""">20.78</span></li>"; string s55 = @"K<i parameter1="""">9</i><span data-char=""↓"" decimals=""2"" k="""">8.90</span></li>"; string s66 = @"D<i parameter1="""">9</i><span data-char=""↓"" decimals=""2"" d="""">14.23</span></li>"; string r00 = Get_Html_img_url(s00); Console.WriteLine(r00); string r11 = Get_Html_a_url(s11); Console.WriteLine(r11); string r22 = Get_Html_stock_lastQuoteTime(s22); Console.WriteLine(r22); string r33 = Get_Html_stock_rsi1(s33); Console.WriteLine(r33); string r44 = Get_Html_stock_rsi2(s44); Console.WriteLine(r44); string r55 = Get_Html_stock_k(s55); Console.WriteLine(r55); string r66 = Get_Html_stock_d(s66); Console.WriteLine(r66); Pause(); } } }