你的位置:首页 > ASP.net教程

[ASP.net教程]C# Html格式内容转Csv内容包括table(重点在rowspan和colspan合并),p,div元素


将Html格式内容转换为Csv内容,支持table(重点处理rowspan和colspan的合并单元格)、p、div元素;不支持嵌套的table。

 /// <summary>
 /// Converts HTML markup to CSV-style text covering table, p and div elements.
 /// Each table is emitted as a "#flag_table#," marker line followed by one line per
 /// row, with rowspan/colspan merged cells expanded onto a rectangular grid. The p
 /// elements and then the div elements are each emitted as a "#flag_text#," marker
 /// line followed by one line per element. Nested tables are not supported.
 /// </summary>
 /// <param name="hrml">The HTML markup to convert. Null or empty yields an empty string.</param>
 /// <returns>The CSV-style text; every cell is followed by "," and every row ends with an extra ",".</returns>
 private string HtmlToCsv(string hrml)
 {
     if (string.IsNullOrEmpty(hrml))
     {
         return string.Empty;
     }

     HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
     doc.LoadHtml(hrml);
     StringBuilder sbLines = new StringBuilder();

     HtmlAgilityPack.HtmlNodeCollection tList = doc.DocumentNode.SelectNodes("//table");
     if (tList != null)
     {
         foreach (HtmlAgilityPack.HtmlNode table in tList)
         {
             sbLines.AppendLine("#flag_table#,");

             // ".//tr" is evaluated relative to this table. The absolute form "//tr"
             // would return every row in the document for every table.
             HtmlAgilityPack.HtmlNodeCollection rows = table.SelectNodes(".//tr");
             if (rows != null)
             {
                 AppendTableRows(sbLines, rows);

                 // Empty the table so its text is not exported again by the p/div passes.
                 table.RemoveAll();
             }
         }
     }

     HtmlAgilityPack.HtmlNodeCollection pList = doc.DocumentNode.SelectNodes("//p");
     if (pList != null)
     {
         // Paragraphs are emptied after export so an enclosing div does not repeat their text.
         AppendTextNodes(sbLines, pList, true);
     }

     HtmlAgilityPack.HtmlNodeCollection dList = doc.DocumentNode.SelectNodes("//div");
     if (dList != null)
     {
         // NOTE(review): divs are not emptied after export, so the text of a nested div
         // appears once for itself and once inside each ancestor div - confirm intended.
         AppendTextNodes(sbLines, dList, false);
     }

     return sbLines.ToString();
 }

 /// <summary>
 /// Lays the cells of <paramref name="rows"/> out on a rectangular grid, honouring
 /// rowspan and colspan, and appends the grid to <paramref name="sbLines"/> as CSV lines.
 /// The grid width is the sum of the colspans of the td cells in the first row.
 /// </summary>
 private static void AppendTableRows(StringBuilder sbLines, HtmlAgilityPack.HtmlNodeCollection rows)
 {
     int rowCount = rows.Count;
     int colCount = 0;
     foreach (HtmlAgilityPack.HtmlNode headCell in GetCells(rows[0]))
     {
         colCount += ReadSpan(headCell, "colspan");
     }

     // A null slot means "not yet claimed by any cell".
     string[][] grid = new string[rowCount][];
     for (int r = 0; r < rowCount; r++)
     {
         grid[r] = new string[colCount];
     }

     for (int r = 0; r < rowCount; r++)
     {
         int col = 0;
         foreach (HtmlAgilityPack.HtmlNode cell in GetCells(rows[r]))
         {
             // Skip slots already claimed by a rowspan that started in an earlier row.
             while (col < colCount && grid[r][col] != null)
             {
                 col++;
             }
             if (col >= colCount)
             {
                 // The row holds more cells than the grid is wide; the extras are dropped.
                 break;
             }

             int colspan = ReadSpan(cell, "colspan");
             int rowspan = ReadSpan(cell, "rowspan");
             string text = CleanText(cell);
             if ((colspan > 1 || rowspan > 1) && text.Length == 0)
             {
                 // An empty merged cell is exported as a single space so that its
                 // origin stays distinguishable from the slots it covers.
                 text = " ";
             }

             // Clamp the span to the grid so an oversized rowspan/colspan cannot index past it.
             int rowEnd = r + System.Math.Min(rowspan, rowCount - r);
             int colEnd = col + System.Math.Min(colspan, colCount - col);
             for (int ri = r; ri < rowEnd; ri++)
             {
                 for (int ci = col; ci < colEnd; ci++)
                 {
                     if (grid[ri][ci] == null)
                     {
                         // Only the top-left slot carries the text; covered slots stay empty.
                         grid[ri][ci] = (ri == r && ci == col) ? text : string.Empty;
                     }
                 }
             }
             col = colEnd;
         }

         // Slots no cell reached are exported as empty fields.
         for (int c = 0; c < colCount; c++)
         {
             if (grid[r][c] == null)
             {
                 grid[r][c] = string.Empty;
             }
         }
     }

     // Assemble the CSV lines.
     foreach (string[] line in grid)
     {
         foreach (string field in line)
         {
             sbLines.Append(field + ",");
         }
         sbLines.AppendLine(",");
     }
 }

 /// <summary>
 /// Appends a "#flag_text#," marker line and then one CSV line per node. A node whose
 /// text is blank after <c>GetTextByHtml</c> contributes a line holding only ",".
 /// </summary>
 /// <param name="sbLines">The builder receiving the output.</param>
 /// <param name="nodes">The nodes whose text is exported, in order.</param>
 /// <param name="clearAfterExport">When true, each node is emptied after it has been exported.</param>
 private void AppendTextNodes(StringBuilder sbLines, HtmlAgilityPack.HtmlNodeCollection nodes, bool clearAfterExport)
 {
     sbLines.AppendLine("#flag_text#,");
     foreach (HtmlAgilityPack.HtmlNode node in nodes)
     {
         string text = GetTextByHtml(CleanText(node));
         if (!string.IsNullOrWhiteSpace(text))
         {
             sbLines.Append(text + ",");
         }
         sbLines.AppendLine(",");

         if (clearAfterExport)
         {
             node.RemoveAll();
         }
     }
 }

 /// <summary>
 /// Returns the direct td children of a table row, in document order.
 /// </summary>
 private static List<HtmlAgilityPack.HtmlNode> GetCells(HtmlAgilityPack.HtmlNode tr)
 {
     return tr.ChildNodes
         .Where(m => string.Equals(m.OriginalName, "td", System.StringComparison.OrdinalIgnoreCase))
         .ToList();
 }

 /// <summary>
 /// Reads a rowspan/colspan attribute. A missing, non-numeric or non-positive value counts as 1.
 /// </summary>
 private static int ReadSpan(HtmlAgilityPack.HtmlNode cell, string attributeName)
 {
     HtmlAgilityPack.HtmlAttribute attr = cell.Attributes[attributeName];
     int span;
     if (attr == null || !int.TryParse(attr.Value, out span) || span < 1)
     {
         return 1;
     }
     return span;
 }

 /// <summary>
 /// Returns the node's inner text with "&amp;nbsp;" entities and line breaks removed,
 /// ASCII commas made CSV-safe, and surrounding whitespace trimmed.
 /// </summary>
 private static string CleanText(HtmlAgilityPack.HtmlNode node)
 {
     // The published listing replaced "," with "," (a no-op), which let commas in the
     // text split CSV fields. Presumably the replacement was a full-width comma that
     // was lost in transcription, so U+FF0C is used here - confirm against consumers.
     return node.InnerText
         .Replace("&nbsp;", "")
         .Replace(",", "\uFF0C")
         .Replace("\r", "")
         .Replace("\n", "")
         .Trim();
 }

 

html: 

 

csv:

 

url:http://www.cnblogs.com/dreamman/p/5343924.html