You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

43 lines
1.1 KiB

1 year ago
1 year ago
1 year ago
1 year ago
1 year ago
  1. # from lxml import etree
  2. # # 解析HTML
  3. # html = """
  4. # <div>
  5. # 123
  6. # <ul class="list">
  7. # <li class="item-0">first item</li>
  8. # <li class="item-1"><a href="link2.html">second item</a></li>
  9. # </ul>
  10. # 456
  11. # <div></div>
  12. # 789
  13. # </div>
  14. # """
  15. # html = etree.HTML(html)
  16. # element = html.xpath("*")
  17. # direct_text = "/html/body/" + html[0][0].tag + "/text()"
  18. # all_text = "/html/body/" + html[0][0].tag + "//text()"
  19. # # 使用XPath选择元素
  20. # results = html.xpath(direct_text)
  21. # # print(results)
  22. # # 拼接所有文本内容并去掉两边的空白
  23. # text = ' '.join(result.strip() for result in results if result.strip())
  24. # # 输出结果
  25. # print(text)
  26. # results = html.xpath(all_text)
  27. # # print(results)
  28. # # 拼接所有文本内容并去掉两边的空白
  29. # text = ' '.join(result.strip() for result in results if result.strip())
  30. # # 输出结果
  31. # print(text)
  32. import re
  33. def lowercase_xpath_tags(xpath):
  34. return re.sub(r"([A-Z]+)(?=[\[\]//]|$)", lambda x: x.group(0).lower(), xpath)
  35. print(lowercase_xpath_tags('//DIV[@id="J_recommendGoods"]/DIV[2]/UL'))
  36. print("//strong//span[contains(@class,'page-item_M4MDr')]/..//following-sibling::a[1]")
  37. print("")