Parse HTML
Library info: HtmlAgilityPack. NuGet: HtmlAgilityPack.
/*/ nuget -\HtmlAgilityPack; /*/
using HtmlAgilityPack;
print.clear(); // clear the print output so that only this script's results are shown
Parse an HTML string or file.
// Sample markup: one link with an id, plus two links inside a list.
var html = """
<html>
<body>
<p>Text</p>
<a id="example" href="https://www.example1.com">Link1</a>
<ul>
<li><a href="https://www.example2.com">Link2</a></li>
<li><a href="https://www.example3.com">Link3</a></li>
</ul>
</body>
</html>
""";

// Parse the markup held in the string.
var doc1 = new HtmlDocument();
doc1.LoadHtml(html);
//doc1.Load(@"C:\Test\test.xml"); //alternative: load from file

// Prints the text of a link element followed by its href attribute.
void printLink(HtmlNode node) {
	print.it(node.InnerText, node.GetAttributeValue("href", null));
}

// Look up a single element by its id attribute and show its markup.
var exampleLink = doc1.GetElementbyId("example");
print.it(exampleLink.OuterHtml);

// Walk the whole tree and print every <a> element.
print.it("All links:");
foreach (var node in doc1.DocumentNode.Descendants("a")) {
	printLink(node);
}

// Same idea, but only the links that are inside list items, selected with XPath.
// The result is checked for null before it is enumerated.
print.it("Select elements using XPath:");
if (doc1.DocumentNode.SelectNodes("//body/ul/li/a") is { } listLinks) {
	foreach (var node in listLinks) {
		printLink(node);
	}
}
Download a web page. Get its title and text.
// Download the page and parse it into a document.
var web = new HtmlWeb();
var doc2 = web.Load("https://www.example.com");

// SelectSingleNode returns null when nothing matches (for example a page without <title>),
// so use ?. and fall back to an empty string instead of throwing NullReferenceException.
var title = doc2.DocumentNode.SelectSingleNode("//head/title")?.InnerText ?? "";
print.it("Title:");
// InnerText keeps HTML entities such as &amp; - decode them, same as for the body text below.
print.it(HtmlEntity.DeEntitize(title));

// Same null-safe pattern for the body text.
var text = doc2.DocumentNode.SelectSingleNode("//body")?.InnerText ?? "";
print.it("Text:");
print.it(HtmlEntity.DeEntitize(text));
More info and examples on the HtmlAgilityPack website.
Get web page HTML from web browser window. Then get all links.
// Find the Chrome window, then its web page DOCUMENT element, and take that element's HTML.
var w = wnd.find(1, "*- Google Chrome", "Chrome_WidgetWin_1");
var html2 = w.Elm["web:DOCUMENT"].Find(30).Html(true);
//w.Elm["web:LINK", "Example"].Find(30); //optional, before getting HTML: wait until the web page is loaded and displays an element (link "Example")
//print.it(html2);

// Parse the captured HTML and print the text and href of every link in it.
var doc3 = new HtmlDocument();
doc3.LoadHtml(html2);
foreach (var anchor in doc3.DocumentNode.Descendants("a")) {
	var href = anchor.GetAttributeValue("href", null);
	print.it(anchor.InnerText, href);
}
See also recipe Web browser automation with Playwright.