If you come to this article, you probably need to extract some data from some XML files, and print the result to the standard output or redirect it to a pipe.

My little script, xpq.py, can extract some data from XML files and display the result in the standard output. It uses libxml2 to achieve that.

Here is how to use it:

python xpq.py xmlfile xpathquery [option]

or

command | python xpq.py xpathquery [option]
option:
content : get the content of the current node (default)
name : get the name of the current node
type : get the type of the current node
parent : get the parent node name of the current node
node : get the representation of the current node

You will find some examples below.

Consider the following simple XML file (test.xml):


<?xml version="1.0" encoding="utf-8"?>
<root>
  <element use="2">
    <subelement id="1">
      Sub-element text
    </subelement>
    <subelement id="2">
      Sub-element 2 text
    </subelement>
  </element>
</root>

Now have a look at the following commands and their output to see what the script does:

$ python xpq.py test.xml /root/element/@use content
2
$ cat test.xml | python xpq.py /root/element/@use content
2
$ python xpq.py test.xml /root/element/subelement[@id=2] content
Sub-element 2 text
$ python xpq.py test.xml /root/element/@use name
use
$ python xpq.py test.xml /root/element/@use type
attribute
$ python xpq.py test.xml /root/element/subelement[@id=1] node
<subelement id="1">
  Sub-element text
</subelement>
$ python xpq.py test.xml /root/element/subelement[@id=1] parent
element

Here is the script (also available for download here; I recommend downloading it rather than copy-pasting, since indentation is significant in Python):

import sys
import libxml2
import os.path
import select

# Poll stdin without blocking (timeout 0.0): a non-empty "ready" list means
# data is already waiting on stdin, in which case all of it is read into
# xmlbuffer. xmlbuffer is only defined when frompipe is True.
# NOTE: because the poll does not wait, a producer that has not written
# anything yet at this point is not detected as piped input.
frompipe = bool(select.select([sys.stdin], [], [], 0.0)[0])
if frompipe:
  xmlbuffer = sys.stdin.read()

# Terminal escape codes: switch to bold text / reset to normal text
bold = "\033[1m"
reset = "\033[0;0m"

def xpq(docxml, query):
  """Return the first node matched by an XPath query, or None.

  docxml -- object exposing xpathEval(query) (here: a parsed libxml2
            document); the result is expected to support len() and indexing.
  query  -- XPath expression, passed to xpathEval unchanged.

  Any exception raised by xpathEval (or by len() on its result) propagates
  to the caller.
  """
  nodes = docxml.xpathEval(query)
  if len(nodes) > 0:
    return nodes[0]
  # Explicit "no match" result; this return used to be nested under the
  # `if` after another return and was therefore unreachable.
  return None

# Print script usage
def printusage(errorcode):
  print "Usage:", bold, "xpq xmlfile xpathquery", reset, "[option]"
  print "or:	command |", bold, "xpq xpathquery", reset, "[option] (Unix systems only)"
  print "option:"
  print "	", bold, "content", reset,"	: get the content of the current node (default)"
  print "	", bold, "name", reset,"		: get the name of the current node"
  print "	", bold, "type", reset,"		: get the type of the current node"
  print "	", bold, "parent", reset,"	: get the parent node name of the current node"
  print "	", bold, "node", reset,"		: get the representation of the current node"
  sys.exit(errorcode);

# Check for arguments
argc = len(sys.argv)
getwhat = "content"
exist = False
if frompipe == True:
  if argc >= 2:
    query = sys.argv[1]
    if argc == 3:
      if sys.argv[2] == "content":
        getwhat = "content"
      elif sys.argv[2] == "name":
        getwhat = "name"
      elif sys.argv[2] == "type":
        getwhat = "type"
      elif sys.argv[2] == "parent":
        getwhat = "parent"
      elif sys.argv[2] == "node":
        getwhat = "node"
      else:
        printusage(3)
    elif argc > 3:
      printusage(1)
  else:
    printusage(1)
else:
  if argc >= 3:
    if os.path.exists( sys.argv[1] ) == True:
      exist = True
      xmlfile = sys.argv[1]
    else:
      print "Input file not found!"
      printusage(2)
    query = sys.argv[2]
    if argc == 4:
      if sys.argv[3] == "content":
        getwhat = "content"
      elif sys.argv[3] == "name":
        getwhat = "name"
      elif sys.argv[3] == "type":
        getwhat = "type"
      elif sys.argv[3] == "parent":
        getwhat = "parent"
      elif sys.argv[3] == "node":
        getwhat = "node"
      else:
        printusage(3)
    elif argc > 4:
      printusage(1)
  else:
    printusage(1)

# Print the result of the query
if exist == True or frompipe == True:
  try:
    if exist == True:
      doc = libxml2.parseFile(xmlfile)
    if frompipe == True:
      doc = libxml2.parseDoc(xmlbuffer)
  except:
    print "Unable to load source file! Is it a valid XML file?"
    printusage(4)
  try:
    node = xpq(doc, query)
    if node == None:
      print "[Error] No node was found"
  except:
    print "Invalid XPath query!"
    doc.freeDoc()
    sys.exit(6)
  if node != None:
    if getwhat == "content":
      print node.content
    elif getwhat == "name":
      print node.name
    elif getwhat == "type":
      print node.type
    elif getwhat == "parent":
      print node.parent.name
    elif getwhat == "node":
      print node
  doc.freeDoc()
else:
  printusage(7)