xpq.py: a python script to query XML files using XPath queries

xpq.py: a python script to query XML files using XPath queries

If you have come to this article, you probably need to extract some data from XML files and either print the result to standard output or send it through a pipe.

My little script, xpq.py, extracts data from XML files and displays the result on standard output. It uses libxml2 to do so.

Here is how to use it:

python xpq.py xmlfile xpathquery [option]

or

command | python xpq.py xpathquery [option]
option:
content : get the content of the current node (default)
name : get the name of the current node
type : get the type of the current node
parent : get the parent node name of the current node
node : get the representation of the current node

You will find some examples below.

Consider the following simple XML file (test.xml):

01
02
03
04
05
06
07
08
09
10
11
<?xml version="1.0" encoding="utf-8"?>
<root>
  <element use="2">
    <subelement id="1">
      Sub-element text
    </subelement>
    <subelement id="2">
      Sub-element 2 text
    </subelement>
  </element>
</root>

Now have a look at the following commands and their output to see what the script does:

01
02
03
04
05
06
07
08
09
10
11
12
13
14
15
16
$ python xpq.py test.xml /root/element/@use content
2
$ cat test.xml | python xpq.py /root/element/@use content
2
$ python xpq.py test.xml /root/element/subelement[@id=2] content
Sub-element 2 text
$ python xpq.py test.xml /root/element/@use name
use
$ python xpq.py test.xml /root/element/@use type
attribute
$ python xpq.py test.xml /root/element/subelement[@id=1] node
<subelement id="1">
  Sub-element text
</subelement>
$ python xpq.py test.xml /root/element/subelement[@id=1] parent
element

Here is the script (also available for download here; I would recommend downloading it rather than copy-pasting, as indentation is significant in Python):

001
002
003
004
005
006
007
008
009
010
011
012
013
014
015
016
017
018
019
020
021
022
023
024
025
026
027
028
029
030
031
032
033
034
035
036
037
038
039
040
041
042
043
044
045
046
047
048
049
050
051
052
053
054
055
056
057
058
059
060
061
062
063
064
065
066
067
068
069
070
071
072
073
074
075
076
077
078
079
080
081
082
083
084
085
086
087
088
089
090
091
092
093
094
095
096
097
098
099
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import sys
import libxml2
import os.path
import select
 
# Detect piped input: poll stdin without blocking and, when data is
# already waiting, read all of it into xmlbuffer.
# select() cannot poll stdin on every platform (on Windows it only
# accepts sockets) and rejects a closed stream. In those cases fall back
# to file mode instead of aborting before the arguments are even parsed.
try:
  frompipe = bool(select.select([sys.stdin,],[],[],0.0)[0])
except (select.error, ValueError):
  frompipe = False
if frompipe:
  xmlbuffer = sys.stdin.read()
 
# Terminal escape codes: "bold" switches to bold text, "reset" switches
# back to normal text.
bold, reset = "\033[1m", "\033[0;0m"
 
def xpq(docxml, query):
  """Return the first result of an XPath query, or None when there is none.

  docxml -- object exposing xpathEval(query), e.g. a parsed libxml2 document
  query  -- XPath expression, passed to docxml.xpathEval unchanged

  Any exception raised by xpathEval propagates to the caller.
  """
  nodes = docxml.xpathEval(query)
  if len(nodes) > 0:
    return nodes[0]
  # Explicit no-match result; this used to happen only by falling off the
  # end of the function because the return statement was unreachable.
  return None
 
# Print script usage
def printusage(errorcode):
  print "Usage:", bold, "xpq xmlfile xpathquery", reset, "[option]"
  print "or:    command |", bold, "xpq xpathquery", reset, "[option] (Unix systems only)"
  print "option:"
  print "    ", bold, "content", reset,"    : get the content of the current node (default)"
  print "    ", bold, "name", reset,"        : get the name of the current node"
  print "    ", bold, "type", reset,"        : get the type of the current node"
  print "    ", bold, "parent", reset,"    : get the parent node name of the current node"
  print "    ", bold, "node", reset,"        : get the representation of the current node"
  sys.exit(errorcode);
 
# Check for arguments
argc = len(sys.argv)
getwhat = "content"
exist = False
if frompipe == True:
  if argc >= 2:
    query = sys.argv[1]
    if argc == 3:
      if sys.argv[2] == "content":
        getwhat = "content"
      elif sys.argv[2] == "name":
        getwhat = "name"
      elif sys.argv[2] == "type":
        getwhat = "type"
      elif sys.argv[2] == "parent":
        getwhat = "parent"
      elif sys.argv[2] == "node":
        getwhat = "node"
      else:
        printusage(3)
    elif argc > 3:
      printusage(1)
  else:
    printusage(1)
else:
  if argc >= 3:
    if os.path.exists( sys.argv[1] ) == True:
      exist = True
      xmlfile = sys.argv[1]
    else:
      print "Input file not found!"
      printusage(2)
    query = sys.argv[2]
    if argc == 4:
      if sys.argv[3] == "content":
        getwhat = "content"
      elif sys.argv[3] == "name":
        getwhat = "name"
      elif sys.argv[3] == "type":
        getwhat = "type"
      elif sys.argv[3] == "parent":
        getwhat = "parent"
      elif sys.argv[3] == "node":
        getwhat = "node"
      else:
        printusage(3)
    elif argc > 4:
      printusage(1)
  else:
    printusage(1)
 
# Print the result of the query
if exist == True or frompipe == True:
  try:
    if exist == True:
      doc = libxml2.parseFile(xmlfile)
    if frompipe == True:
      doc = libxml2.parseDoc(xmlbuffer)
  except:
    print "Unable to load source file! Is it a valid XML file?"
    printusage(4)
  try:
    node = xpq(doc, query)
    if node == None:
      print "[Error] No node was found"
  except:
    print "Invalid XPath query!"
    doc.freeDoc()
    sys.exit(6)
  if node != None:
    if getwhat == "content":
      print node.content
    elif getwhat == "name":
      print node.name
    elif getwhat == "type":
      print node.type
    elif getwhat == "parent":
      print node.parent.name
    elif getwhat == "node":
      print node
  doc.freeDoc()
else:
  printusage(7)

Leave a Reply

Your email address will not be published. Required fields are marked *

*

This site uses Akismet to reduce spam. Learn how your comment data is processed.