XML ElementTree
https://towardsdatascience.com/processing-xml-in-python-elementtree-c8992941efd2 : Processing XML in Python — ElementTree by Deepesh Nair
# XML (eXtensible Markup Language): elements, attributes, text
# Walk-through of xml.etree.ElementTree: parse a movie catalogue, query it
# with XPath, patch attributes, move an element, and write the result out.
import re
import xml.etree.ElementTree as ET

### Parsing
tree = ET.parse('./movie.xml')
root = tree.getroot()
print(f'tag=>{root.tag} attrib=>{root.attrib} text=>{root.text}')

### Parse with loop
# Iterating an element yields its direct children only.
for child in root:
    print(f'tag=>{child.tag} attrib=>{child.attrib} text=>{child.text}')

# root.iter() walks the whole tree. These bare expressions only display a
# value when run as the last statement of a notebook cell.
[elem.tag for elem in root.iter()]
[elem.attrib for elem in root.iter()]
[elem.text for elem in root.iter()]

# Passing a tag name restricts the walk to elements with that tag.
for rating in root.iter('rating'):
    print(f'tag=>{rating.tag} attrib=>{rating.attrib} text=>{rating.text}')

### Scan with XPath
#- XPath with tag: movies that have a <year> child whose text is '2000'
for movie in root.findall("./genre/decade/movie/[year='2000']"):
    print(f'tag=>{movie.tag} attrib=>{movie.attrib} text=>{movie.text}')

#- XPath with attrib: <format> elements whose 'multiple' attribute is 'Yes'
for movie in root.findall("./genre/decade/movie/format/[@multiple='Yes']"):
    print(f'tag=>{movie.tag} attrib=>{movie.attrib} text=>{movie.text}')

# A trailing '...' ('..' followed by '.') steps up to the parent element
# of each matching <format>.
for movie in root.findall("./genre/decade/movie/format/[@multiple='Yes']..."):
    print(f'tag=>{movie.tag} attrib=>{movie.attrib} text=>{movie.text}')

#### Modify and Save
b2tf = root.find("./genre/decade/movie[@title='Back 2 the Future']")
print(b2tf)
b2tf.attrib["title"] = 'Back Back Back'
tree.write("movies.xml")

# Search with regex: a comma in the <format> text means several formats.
# An empty element has .text == None, so fall back to '' to avoid a
# TypeError from re.search().
for form in root.findall("./genre/decade/movie/format"):
    print(re.search(',', form.text or ''))

# Add (or overwrite) the 'multiple' attribute with set().
# The value must be spelled 'Yes' (capitalised) so that it agrees with 'No'
# and with the [@multiple='Yes'] XPath queries used in this script.
for form in root.findall("./genre/decade/movie/format"):
    match = re.search(',', form.text or '')
    if match:
        form.set('multiple', 'Yes')
    else:
        form.set('multiple', 'No')

for form in root.findall("./genre/decade/movie/format"):
    print(form.attrib, form.text)

# Find wrong data: list every decade together with the years filed under it.
for decade in root.findall("./genre/decade"):
    print(decade.attrib)
    for year in decade.findall("./movie/year"):
        print(year.text)

for movie in root.findall("./genre/decade/movie/[year='2000']"):
    print(movie.attrib)

# Create a '2000s' decade under the Action genre and move X-Men into it.
action = root.find("./genre[@category='Action']")
new_dec = ET.SubElement(action, 'decade')
new_dec.attrib["years"] = '2000s'

xmen = root.find("./genre/decade/movie[@title='X-Men']")
dec2000s = root.find("./genre[@category='Action']/decade[@years='2000s']")
dec2000s.append(xmen)
# append() does not detach the element from its old parent, so remove it
# from the 1990s decade explicitly.
dec1990s = root.find("./genre[@category='Action']/decade[@years='1990s']")
dec1990s.remove(xmen)

# Build the new XML: save, reload from disk, and print the serialised tree.
tree.write("movies.xml")
tree = ET.parse('movies.xml')
root = tree.getroot()
print(ET.tostring(root, encoding='utf8').decode('utf8'))
DVD 1981 PG 'Archaeologist and adventurer Indiana Jones is hired by the U.S. government to find the Ark of the Covenant before the Nazis.' DVD,Online 1984 PG None provided. Blu-ray 1985 PG Marty McFly dvd, digital 2000 PG-13 Two mutants come to a private academy for their kind whose resident superhero team must oppose a terrorist organization with similar powers. VHS 1992 PG13 NA. Online 1992 R WhAtEvER I Want!!!?! DVD 1979 R """"""""" DVD 1986 PG13 Funny movie on funny guy blue-ray 2000 Unrated psychopathic Bateman
BeautifulSoup .prettify()
BeautifulSoup may be the simplest solution for Python versions before 3.9 (Python 3.9 added `xml.etree.ElementTree.indent()` to the standard library).
https://stackoverflow.com/questions/749796/pretty-printing-xml-in-python
# Pretty-print an XML annotation file with BeautifulSoup.
import os

from bs4 import BeautifulSoup

# open() does not expand '~', so resolve the home directory explicitly;
# otherwise the path is looked up literally and raises FileNotFoundError.
xmlFile = os.path.expanduser('~/mmdetection/oxford/annotations/xmls/Abyssinian_160.xml')

# Parse inside a context manager so the file handle is closed promptly.
with open(xmlFile) as xml_handle:
    bs = BeautifulSoup(xml_handle, 'xml')

pretty_xml = bs.prettify()
print(pretty_xml)
lxml
Prettier output, but it requires passing extra arguments to `etree.tostring`.
# Pretty-print the same XML file with lxml.
import os

from lxml import etree

# The path is handed to the parser as-is, so a leading '~' must be expanded
# here; expanduser() leaves an already-expanded path unchanged.
x = etree.parse(os.path.expanduser(xmlFile))

# encoding=str makes tostring() return text rather than bytes.
pretty_xml = etree.tostring(x, pretty_print=True, encoding=str)
print(pretty_xml)