xpq.py: a python script to query XML files using XPath queries

If you have landed on this article, you probably need to extract some data from XML files and print the result to standard output or send it through a pipe.

My little script, xpq.py, can extract some data from XML files and display the result in the standard output. It uses libxml2 to achieve that.

Here is how to use it:

python xpq.py xmlfile xpathquery [option]

or

command | python xpq.py xpathquery [option]
option:
content : get the content of the current node (default)
name : get the name of the current node
type : get the type of the current node
parent : get the parent node name of the current node
node : get the representation of the current node

You will find some examples below.

Consider the following simple XML file (test.xml):


<?xml version="1.0" encoding="utf-8"?>
<root>
  <element use="2">
    <subelement id="1">
      Sub-element text
    </subelement>
    <subelement id="2">
      Sub-element 2 text
    </subelement>
  </element>
</root>

Now have a look at the following commands and their output to see what the script does:

$ python xpq.py test.xml /root/element/@use content
2
$ cat test.xml | python xpq.py /root/element/@use content
2
$ python xpq.py test.xml /root/element/subelement[@id=2] content
Sub-element 2 text
$ python xpq.py test.xml /root/element/@use name
use
$ python xpq.py test.xml /root/element/@use type
attribute
$ python xpq.py test.xml /root/element/subelement[@id=1] node
<subelement id="1">
  Sub-element text
</subelement>
$ python xpq.py test.xml /root/element/subelement[@id=1] parent
element

Here is the script (also available for download here; I recommend downloading it rather than copy-pasting, as indentation is significant in Python):

import sys
import libxml2
import os.path
import select

# Decide whether the XML document arrives on standard input.
#
# A select() poll with a zero timeout only reports data that has *already*
# been written, so a producer that is slower than this script's start-up
# (e.g. `curl ... | python xpq.py query`) would be missed and the piped form
# would fall through to the usage message. Testing whether stdin is a
# terminal does not depend on timing: any non-terminal stdin (pipe or
# redirected file) is treated as the XML source.
if sys.stdin.isatty():
  frompipe = False
else:
  frompipe = True
  # Read everything up to EOF; this blocks until the producer closes its end.
  xmlbuffer = sys.stdin.read()

# Terminal escape codes used by the usage text:
# `bold` switches to bold output, `reset` switches back to normal text.
bold, reset = "\033[1m", "\033[0;0m"

# Return the first node matched by an XPath query, or None when nothing matches.
def xpq(docxml, query):
  """Evaluate `query` against `docxml` and return the first result.

  docxml -- object exposing xpathEval(query); in this script, the parsed
            XML document
  query  -- XPath expression, passed unchanged to xpathEval()

  Returns the first item of the evaluation result, or None when the result
  is empty. Exceptions raised by xpathEval() propagate to the caller.
  """
  nodes = docxml.xpathEval(query)
  # len() rather than truthiness: a result that has no length keeps raising
  # TypeError as before instead of being silently treated as "no match".
  if len(nodes) > 0:
    return nodes[0]
  # Explicit fallback (previously an unreachable statement nested under the
  # `if`, so None was only returned by falling off the end of the function).
  return None

# Print script usage, then terminate the process with the given exit status.
def printusage(errorcode):
  """Write the usage text to standard output and exit.

  errorcode -- value handed to sys.exit(); this function never returns.

  Each tuple below is one output line; its items are joined with a single
  space, which reproduces what the former multi-argument print statements
  emitted while also being valid syntax on Python 3.
  """
  usagelines = (
    ("Usage:", bold, "xpq xmlfile xpathquery", reset, "[option]"),
    ("or: command |", bold, "xpq xpathquery", reset, "[option] (Unix systems only)"),
    ("option:",),
    (" ", bold, "content", reset, " : get the content of the current node (default)"),
    (" ", bold, "name", reset, "  : get the name of the current node"),
    (" ", bold, "type", reset, "  : get the type of the current node"),
    (" ", bold, "parent", reset, " : get the parent node name of the current node"),
    (" ", bold, "node", reset, "  : get the representation of the current node"),
  )
  for items in usagelines:
    print(" ".join(items))
  sys.exit(errorcode)

# Check for arguments
#
# Two call shapes are accepted:
#   piped input : xpq.py xpathquery [option]
#   file input  : xpq.py xmlfile xpathquery [option]
# The only positional difference is the leading xmlfile, so both shapes are
# handled by one code path with shifted indexes.
#
# Results: `query` (XPath string), `getwhat` (what to print about the node),
# `exist` (True once the input file was found) and, in file mode, `xmlfile`.
# Exit statuses via printusage(): 1 = wrong argument count,
# 2 = input file not found, 3 = unknown option.
argc = len(sys.argv)
getwhat = "content"
exist = False
validoptions = ("content", "name", "type", "parent", "node")

queryindex = 1 if frompipe else 2
optionindex = queryindex + 1

# The query is mandatory in both shapes.
if argc <= queryindex:
  printusage(1)

if not frompipe:
  if os.path.exists(sys.argv[1]):
    exist = True
    xmlfile = sys.argv[1]
  else:
    print("Input file not found!")
    printusage(2)

query = sys.argv[queryindex]

if argc > optionindex + 1:
  # Anything after the option is unexpected.
  printusage(1)
elif argc == optionindex + 1:
  if sys.argv[optionindex] in validoptions:
    getwhat = sys.argv[optionindex]
  else:
    printusage(3)

# Print the result of the query
#
# Loads the document (from the piped buffer or from the file), runs the XPath
# query and prints the requested facet of the first matching node.
# Exit statuses: 4 = document could not be parsed, 6 = XPath evaluation
# failed, 7 = no input source was selected.
if exist or frompipe:
  try:
    # Piped data takes precedence, as it did when both branches were tested.
    if frompipe:
      doc = libxml2.parseDoc(xmlbuffer)
    else:
      doc = libxml2.parseFile(xmlfile)
  except Exception:
    # `Exception` instead of a bare except, so Ctrl-C is not reported as a
    # parse failure.
    print("Unable to load source file! Is it a valid XML file?")
    printusage(4)
  try:
    # Only the evaluation itself is guarded: an output error must not be
    # reported as an invalid query.
    try:
      node = xpq(doc, query)
    except Exception:
      print("Invalid XPath query!")
      sys.exit(6)
    if node is None:
      # NOTE(review): the exit status stays 0 here, and status 5 is unused
      # by this script - confirm whether a non-zero status was intended.
      print("[Error] No node was found")
    elif getwhat == "content":
      print(node.content)
    elif getwhat == "name":
      print(node.name)
    elif getwhat == "type":
      print(node.type)
    elif getwhat == "parent":
      print(node.parent.name)
    elif getwhat == "node":
      print(node)
  finally:
    # Release the document on every path, including sys.exit() and any
    # error raised while printing.
    doc.freeDoc()
else:
  printusage(7)
Related Post