If you come to this article, you probably need to extract some data from some XML files, and print the result to the standard output or redirect it to a pipe.
My little script, xpq.py, can extract some data from XML files and display the result in the standard output. It uses libxml2 to achieve that.
Here is how to use it:
python xpq.py xmlfile xpathquery [option]
or
command | python xpq.py xpathquery [option]
option:
content : get the content of the current node (default)
name : get the name of the current node
type : get the type of the current node
parent : get the parent node name of the current node
node : get the representation of the current node
You will find some examples below.
Consider the following simple XML file (test.xml):
<?xml version="1.0" encoding="utf-8"?>
<root>
  <element use="2">
    <subelement id="1">Sub-element text</subelement>
    <subelement id="2">Sub-element 2 text</subelement>
  </element>
</root>
Now have a look at the following commands and their output to see what the script does:
$ python xpq.py test.xml /root/element/@use content
2
$ cat test.xml | python xpq.py /root/element/@use content
2
$ python xpq.py test.xml /root/element/subelement[@id=2] content
Sub-element 2 text
$ python xpq.py test.xml /root/element/@use name
use
$ python xpq.py test.xml /root/element/@use type
attribute
$ python xpq.py test.xml /root/element/subelement[@id=1] node
<subelement id="1">Sub-element text</subelement>
$ python xpq.py test.xml /root/element/subelement[@id=1] parent
element
Here is the script (also available for download here; I recommend downloading it rather than copy-pasting, because indentation is significant in Python):
#!/usr/bin/env python
"""xpq.py -- print one property of the first node matched by an XPath query.

Usage:
    python xpq.py xmlfile xpathquery [option]
    command | python xpq.py xpathquery [option]      (Unix systems only)

option is one of: content (default), name, type, parent, node.

The result is written to standard output; every diagnostic (error messages
and the usage text) is written to standard error so that it never mixes
with the extracted data in a pipeline.

Exit status:
    0  success
    1  wrong number of arguments
    2  input file not found
    3  unknown option
    4  the source could not be parsed as XML
    5  the query matched no node (or the node has no parent)
    6  invalid XPath query
    8  the libxml2 Python bindings are not installed

The code avoids print statements so that it runs on Python 2 and Python 3.
"""

import os.path
import select
import sys

try:
    import libxml2
except ImportError:
    # Reported by main() with a clear message instead of a traceback.
    libxml2 = None

# Codes to print bold text and reset to normal text
bold = "\033[1m"
reset = "\033[0;0m"

# Accepted values for the optional last argument; the first is the default.
OPTIONS = ("content", "name", "type", "parent", "node")


def stdin_has_data():
    """Return True when standard input is readable right now.

    The check uses a zero timeout, so a producer that has not written
    anything yet when the script starts is not detected.
    select() cannot poll standard input on every platform (or when stdin is
    closed or replaced); in that case the script falls back to file mode.
    """
    try:
        readable = select.select([sys.stdin], [], [], 0.0)[0]
    except (select.error, OSError, ValueError, TypeError):
        return False
    return bool(readable)


def xpq(docxml, query):
    """Return the first node of the result of an XPath query, None otherwise.

    docxml -- object exposing xpathEval(query), e.g. a parsed libxml2 document
    query  -- XPath expression

    XPath expressions may also evaluate to a number, a string or a boolean
    (for example count(...)); such results contain no node, so None is
    returned for them as well as for an empty node set.
    """
    nodes = docxml.xpathEval(query)
    if isinstance(nodes, (list, tuple)) and nodes:
        return nodes[0]
    return None


def printusage(errorcode):
    """Write the usage text to standard error and exit with errorcode."""
    lines = (
        ("Usage:", bold, "xpq xmlfile xpathquery", reset, "[option]"),
        ("or: command |", bold, "xpq xpathquery", reset,
         "[option] (Unix systems only)"),
        ("option:",),
        (" ", bold, "content", reset,
         " : get the content of the current node (default)"),
        (" ", bold, "name", reset, " : get the name of the current node"),
        (" ", bold, "type", reset, " : get the type of the current node"),
        (" ", bold, "parent", reset,
         " : get the parent node name of the current node"),
        (" ", bold, "node", reset,
         " : get the representation of the current node"),
    )
    for parts in lines:
        sys.stderr.write(" ".join(parts) + "\n")
    sys.exit(errorcode)


def parse_arguments(argv, frompipe):
    """Validate the command line and return (xmlfile, query, getwhat).

    argv     -- full argument list, program name included
    frompipe -- True when the XML document comes from standard input; the
                file name argument is then absent and xmlfile is None

    Exits through printusage() with status 1, 2 or 3 on invalid input.
    """
    # Without a file name every following argument moves one slot left.
    query_index = 1 if frompipe else 2
    argc = len(argv)
    if argc <= query_index:
        printusage(1)

    xmlfile = None
    if not frompipe:
        xmlfile = argv[1]
        if not os.path.exists(xmlfile):
            sys.stderr.write("Input file not found!\n")
            printusage(2)

    if argc > query_index + 2:
        printusage(1)

    getwhat = OPTIONS[0]
    if argc == query_index + 2:
        getwhat = argv[query_index + 1]
        if getwhat not in OPTIONS:
            printusage(3)
    return xmlfile, argv[query_index], getwhat


def describe_node(node, getwhat):
    """Return the property of node selected by getwhat (one of OPTIONS)."""
    if getwhat == "content":
        return node.content
    if getwhat == "name":
        return node.name
    if getwhat == "type":
        return node.type
    if getwhat == "parent":
        return node.parent.name
    # "node": the node's own string representation.
    return str(node)


def main(argv=None):
    """Run the command-line tool and return its exit status."""
    if argv is None:
        argv = sys.argv
    if libxml2 is None:
        sys.stderr.write("The libxml2 Python bindings are required.\n")
        return 8

    # Check if some data is in stdin, if yes, read its content to xmlbuffer
    frompipe = stdin_has_data()
    xmlbuffer = sys.stdin.read() if frompipe else None
    xmlfile, query, getwhat = parse_arguments(argv, frompipe)

    try:
        if frompipe:
            doc = libxml2.parseDoc(xmlbuffer)
        else:
            doc = libxml2.parseFile(xmlfile)
    except Exception:
        sys.stderr.write("Unable to load source file! Is it a valid XML file?\n")
        printusage(4)

    try:
        try:
            node = xpq(doc, query)
        except Exception:
            sys.stderr.write("Invalid XPath query!\n")
            return 6
        if node is None:
            sys.stderr.write("[Error] No node was found\n")
            return 5
        if getwhat == "parent" and node.parent is None:
            # The document node itself (query "/") has no parent.
            sys.stderr.write("[Error] No parent node was found\n")
            return 5
        print(describe_node(node, getwhat))
        return 0
    finally:
        # Release the C-level document on every path, including errors.
        doc.freeDoc()


if __name__ == "__main__":
    sys.exit(main())
Recent Comments