Thread Rating:
  • 0 Vote(s) - 0 Average
  • 1
  • 2
  • 3
  • 4
  • 5
Extracting data from web page (table etc); will HtmlDoc be included in LA?
#2
Quote:I was worried I will be kick off from the forum for asking too many

Your questions help LA development: they test LA with real tasks, etc.
 
Quote:Will HtmlDoc be included in LA?

Unlikely. Use other libraries, for example HtmlAgilityPack. Look in Cookbook -> Internet -> Parse HTML.

In this case you can use elm.

Code:
Copy      Help
// script "Opera arias.cs"
//Scrapes the results table at https://www.opera-arias.com/arias/#x from an open Chrome window,
//following the "Next" link page by page, and prints all collected rows as CSV.

print.clear();

//One CSV row is added per table row; the table is read as 9 columns.
var csv = new csvTable { ColumnCount = 9 };

//The page must already be open in Chrome.
var w = wnd.find(1, "Opera Arias *- Google Chrome", "Chrome_WidgetWin_1");

while (true) {
    _Page();
    //break; //uncomment to stop after the first page

    //A null result means there is no "Next" link, ie this was the last page.
    //NOTE(review): presumably the negative wait makes Find return null instead of throwing - confirm in the LA docs.
    var nextLink = w.Elm["web:LINK", "Next"].Find(-1);
    if (nextLink is null) break;

    nextLink.WebInvoke();
}

print.it(csv);

//Reads all rows of the table on the current page and appends them to csv.
//Uses the captured variables w (browser window) and csv (output table).
void _Page() {
    var table = w.Elm["web:GROUPING", prop: "@id=table_div"].Find(1);
    
    //Some cells are empty, and there are no elms for empty cells, therefore cell indices become incorrect.
    //Solution: at first get column x offsets from the header row. Then can skip empty cells.

    var header = table.Navigate("pr");
    var ax = header.Elm["LINK"].FindAll().Select(o => o.Rect.CenterX).ToArray();
    
    for (var row = table.Navigate("fi"); row != null; row = row.Navigate("ne")) {
        var cells = new string[csv.ColumnCount];
        var cell = row.Navigate("fi ne");
        
        //A row without that element has no data cells. Skip it instead of failing on a null cell.
        if (cell == null) continue;
        
        for (int i = 0; i < cells.Length; i++) {
            if (i > 0) { cell = cell.Navigate("ne"); if (cell == null) break; }
            
            //Correct column index for empty cells: while the cell starts to the right of the
            //center of header column i, it belongs to a later column.
            //The i < ax.Length guard prevents reading past the last header column.
            int x = cell.Rect.left;
            while (i < ax.Length && x != 0 && x > ax[i]) i++;
            
            //The cell is to the right of all known columns. Nowhere to store it.
            if (i >= cells.Length) break;
            
            //Column 1: text from the "style" HTML attribute without its first 6 and last 2 characters.
            //Column 6: name of the first child element.
            //Other columns: name of the cell element.
            //NOTE(review): the meaning of columns 1 and 6 depends on the page markup - confirm against the page.
            var s = i switch { 1 => cell.HtmlAttribute("style")[6..^2], 6 => cell.Navigate("fi").Name, _ => cell.Name };
            
            if (i == cells.Length - 1) { //the last column. Some cells consist of multiple elements.
                //This leaves cell == null, which is fine because the loop ends after the last column.
                while ((cell = cell.Navigate("ne")) != null) s += cell.Name;
            }
            
            cells[i] = s;
        }

        csv.AddRow(cells);
    }
}


Messages In This Thread
RE: Will HtmlDoc be included in LA? - by Gintaras - 05-14-2023, 07:34 AM
RE: Will HtmlDoc be included in LA? - by birdywen - 05-14-2023, 12:10 PM
RE: Will HtmlDoc be included in LA? - by Gintaras - 05-14-2023, 12:31 PM
RE: Will HtmlDoc be included in LA? - by birdywen - 05-14-2023, 12:44 PM
RE: Will HtmlDoc be included in LA? - by birdywen - 05-20-2023, 02:58 PM
RE: Will HtmlDoc be included in LA? - by Gintaras - 05-20-2023, 04:07 PM
RE: Will HtmlDoc be included in LA? - by birdywen - 05-20-2023, 04:27 PM
RE: Will HtmlDoc be included in LA? - by birdywen - 05-27-2023, 06:18 AM
RE: Will HtmlDoc be included in LA? - by Gintaras - 05-27-2023, 07:16 AM
RE: Will HtmlDoc be included in LA? - by birdywen - 05-27-2023, 07:23 AM
RE: Will HtmlDoc be included in LA? - by birdywen - 05-31-2023, 03:13 AM
RE: Will HtmlDoc be included in LA? - by Gintaras - 05-31-2023, 04:05 AM
RE: Will HtmlDoc be included in LA? - by Gintaras - 05-31-2023, 04:25 AM

Forum Jump:


Users browsing this thread: 1 Guest(s)