hwdb: PNP/ACPI lists on uefi.org are now in CSV format

Adjust the parsing as it's no longer HTML files. Some IDs end with
whitespace, without being quoted, which seems like a mistake as they
weren't before, so strip the ID columns before applying them.
This commit is contained in:
Luca Boccassi 2023-11-14 20:46:12 +00:00
parent 094d85a6bb
commit 9a86f08412

View file

@ -1,74 +1,26 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: LGPL-2.1-or-later
from html.parser import HTMLParser
from csv import reader
from enum import Enum
class State(Enum):
NOWHERE = 0
COMPANY = 1
AFTER_COMPANY = 2
PNPID = 3
AFTER_PNPID = 4
DATE = 5
class PNPTableParser(HTMLParser):
def __init__(self):
HTMLParser.__init__(self)
self.state = State.NOWHERE
self.data = ""
self.pnpid = None
self.company = None
self.table = []
def handle_starttag(self, tag, attrs):
if tag == "td":
if self.state == State.NOWHERE:
self.state = State.COMPANY
elif self.state == State.AFTER_COMPANY:
self.state = State.PNPID
elif self.state == State.AFTER_PNPID:
self.state = State.DATE
else:
raise ValueError
self.data = ""
def handle_endtag(self, tag):
if tag == "td":
if self.state == State.COMPANY:
self.company = ' '.join(self.data.strip().split())
self.state = State.AFTER_COMPANY
elif self.state == State.PNPID:
self.pnpid = self.data.strip()
self.state = State.AFTER_PNPID
self.table.append((self.pnpid, self.company))
elif self.state == State.DATE:
self.state = State.NOWHERE
else:
raise ValueError
def handle_data(self, data):
self.data += data
def read_table(a):
parser = PNPTableParser()
table = []
for line in a:
parser.feed(line)
with open(a, newline='') as csvfile:
for row in reader(csvfile):
if row[0] == "Company":
# Skip header
continue
table.append(row)
parser.close()
parser.table.sort()
table.sort(key=lambda x: x[1])
for pnpid, company in parser.table:
print("\nacpi:{0}*:\n ID_VENDOR_FROM_DATABASE={1}".format(pnpid, company))
a = open("acpi_id_registry.html")
b = open("pnp_id_registry.html")
for row in table:
# Some IDs end with whitespace, while they didn't in the old HTML table, so it's probably
# a mistake, strip it.
print("\nacpi:{0}*:\n ID_VENDOR_FROM_DATABASE={1}".format(row[1].strip(), row[0].strip()))
print('# This file is part of systemd.\n'
'#\n'
@ -76,5 +28,5 @@ print('# This file is part of systemd.\n'
'# https://uefi.org/uefi-pnp-export\n'
'# https://uefi.org/uefi-acpi-export')
read_table(a)
read_table(b)
read_table("acpi_id_registry.html")
read_table("pnp_id_registry.html")