java - Fast replacement of XML node values -
I have a bunch of XML documents that contain personal information which I need to replace with fake data. The person node contains the following elements:
- uuid - required, should not be touched.
- firstname - optional
- lastname - optional
- address - optional
- personid - required
A person may appear many times, in which case the same fake data should be used, i.e. if two person nodes have the same personid, they should both receive the same fake id.
I have implemented Java code that builds a DOM tree from the XML string and replaces the nodes before writing it back to a string. This works fine, but since I have so many documents I was wondering if there is a faster approach. Maybe through regular expressions or XSLT or something?
Here is an example document:
<adocument> <stuff> ... </stuff> <otherstuff> ... </otherstuff> <person> <uuid>11111111-1111-1111-1111-111111111111</uuid> <firstname>some</firstname> <lastname>person</lastname> <personid>111111111111</personid> </person> <person> <uuid>22222222-2222-2222-2222-222222222222</uuid> <firstname>another person</firstname> <address>main st. 2</address> <personid>222222222222</personid> </person> <person> <uuid>33333333-3333-3333-3333-333333333333</uuid> <firstname>some</firstname> <lastname>person</lastname> <personid>111111111111</personid> </person> <morestuff> ... </morestuff> </adocument>
And the current implementation:
public string replacewithfalsedata(string xmlinstance) { document dom = todom(xmlinstance); xpathexpression xpathexpression = xpathexpressionfactory.createxpathexpression("//person"); list<node> nodelist = xpathexpression.evaluateasnodelist(dom); for(node personnode : nodelist) { map<string, node> childnodes = getchildnodes(personnode); string personid = childnodes.get("personid").gettextcontent(); // retrieve cached fake person using id, or create new 1 if none exists. person fakeperson = getfakeperson(personid); setifexists(childnodes.get("firstname"), fakeperson.getfirstname()); setifexists(childnodes.get("lastname"), fakeperson.getlastname()); setifexists(childnodes.get("address"), fakeperson.getaddress()); setifexists(childnodes.get("personid"), fakeperson.getpersonid()); } return tostring(dom); } public map<string, node> getchildnodes(node parent) { map<string, node> childnodes = new hashmap<string, node>(); for(node child = parent.getfirstchild(); child != null; child = child.getnextsibling()) { if(child.getlocalname() != null) { childnodes.put(child.getlocalname(), child); } } return childnodes; } public void setifexists(node node, string value) { if(node != null) { node.settextcontent(value); } }
You are using a DOM-based API. Faster replacement can be achieved with the Streaming API for XML (StAX); in many cases it can outperform a DOM-based API: StAX versus DOM
The DOM API occupies more memory than StAX, which can degrade performance, but it is easier to use than the StAX API.
Working solution example - tested on a 150 MB XML file, replaced in 10 sec:
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;

import javax.xml.stream.XMLEventFactory;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLEventWriter;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.XMLEvent;

/**
 * Streams an XML document with StAX and replaces the personal data inside every
 * {@code person} element with fake data.
 *
 * <p>Inside a {@code person} element the content of {@code personid},
 * {@code firstname}, {@code lastname} and {@code address} is replaced; every other
 * event (including {@code uuid}) is copied to the output unchanged. Persons that
 * share the same original {@code personid} receive the same fake data.
 *
 * <p>The fake-person cache is a plain static {@link HashMap}; it is shared by all
 * calls within the JVM and is not synchronized.
 */
public class ReplaceXmlWithFakeUser {

    /** Input file used when no first command-line argument is given. */
    private static final String DEFAULT_INPUT = "c:\\temp\\persons.xml";

    /** Output file used when no second command-line argument is given. */
    private static final String DEFAULT_OUTPUT = "c:\\temp\\fakepersons.xml";

    /** Fake persons created so far, keyed by the original (trimmed) personid. */
    private static final Map<String, Person> FAKE_PERSONS = new HashMap<String, Person>();

    /**
     * Anonymizes one XML file.
     *
     * @param args optional: {@code args[0]} is the input file, {@code args[1]} the
     *             output file; the built-in default paths are used for missing arguments
     * @throws XMLStreamException if the input is not well-formed XML, or a person
     *                            element has data to replace but no usable personid
     * @throws IOException        if a file cannot be opened or closed
     */
    public static void main(String[] args) throws XMLStreamException, IOException {
        String inputPath = args != null && args.length > 0 ? args[0] : DEFAULT_INPUT;
        String outputPath = args != null && args.length > 1 ? args[1] : DEFAULT_OUTPUT;

        // The StAX reader/writer are closed inside anonymize(); the file streams are
        // closed here so the file handles are released even when processing fails.
        InputStream in = new BufferedInputStream(new FileInputStream(inputPath));
        try {
            OutputStream out = new BufferedOutputStream(new FileOutputStream(outputPath));
            try {
                anonymize(in, out);
            } finally {
                out.close();
            }
        } finally {
            in.close();
        }
    }

    /** Copies all events from {@code in} to {@code out}, rewriting each person element. */
    private static void anonymize(InputStream in, OutputStream out) throws XMLStreamException {
        XMLInputFactory inFactory = XMLInputFactory.newInstance();
        // The documents only need to be copied; never fetch external entities.
        inFactory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
        XMLEventReader eventReader = inFactory.createXMLEventReader(in);
        XMLEventWriter writer = XMLOutputFactory.newInstance().createXMLEventWriter(out);
        XMLEventFactory eventFactory = XMLEventFactory.newInstance();
        try {
            while (eventReader.hasNext()) {
                XMLEvent event = eventReader.nextEvent();
                writer.add(event);
                if (isStartOf(event, "person")) {
                    rewritePerson(eventReader, writer, eventFactory);
                }
            }
        } finally {
            try {
                writer.close();
            } finally {
                eventReader.close();
            }
        }
    }

    /**
     * Handles the content of one person element; the person start tag has already
     * been written. Reads up to and including the person end tag and writes it.
     */
    private static void rewritePerson(XMLEventReader eventReader, XMLEventWriter writer,
            XMLEventFactory eventFactory) throws XMLStreamException {
        /*
         * Step 1: personid may come after firstname/lastname/address, so those
         * elements cannot be overwritten while streaming. Buffer every event up to
         * the person end tag and collect the personid text on the way.
         */
        List<XMLEvent> eventsWithinPersonElement = new ArrayList<XMLEvent>();
        StringBuilder personIdText = null;
        boolean insidePersonId = false;
        XMLEvent event = eventReader.nextEvent();
        while (!isEndOf(event, "person")) {
            eventsWithinPersonElement.add(event);
            if (isStartOf(event, "personid")) {
                personIdText = new StringBuilder();
                insidePersonId = true;
            } else if (isEndOf(event, "personid")) {
                insidePersonId = false;
            } else if (insidePersonId && event.isCharacters()) {
                // The parser may deliver one text node as several Characters events.
                personIdText.append(event.asCharacters().getData());
            }
            event = eventReader.nextEvent();
        }
        XMLEvent personEndElement = event;

        String personId = personIdText == null ? "" : personIdText.toString().trim();
        Person fakePerson = personId.isEmpty() ? null : getFakePerson(personId);

        // Step 2: replay the buffered events, swapping in the fake values.
        for (Iterator<XMLEvent> it = eventsWithinPersonElement.iterator(); it.hasNext();) {
            XMLEvent buffered = it.next();
            writer.add(buffered);
            if (!buffered.isStartElement()) {
                continue;
            }
            String field = buffered.asStartElement().getName().getLocalPart();
            if (!isFakedField(field)) {
                continue;
            }
            if (fakePerson == null) {
                // Refuse to continue rather than copy real personal data to the output.
                throw new XMLStreamException(
                        "person element contains <" + field + "> but no usable personid",
                        buffered.getLocation());
            }
            writer.add(eventFactory.createCharacters(fakeValueFor(fakePerson, field)));
            // Drop the real content (zero or more events) and write the end tag.
            writer.add(skipToMatchingEnd(it));
        }
        writer.add(personEndElement);
    }

    /**
     * Consumes events up to the end tag that closes the element whose start tag was
     * just returned by the iterator, and returns that end tag.
     */
    private static XMLEvent skipToMatchingEnd(Iterator<XMLEvent> events) {
        int depth = 0;
        while (events.hasNext()) {
            XMLEvent event = events.next();
            if (event.isStartElement()) {
                depth++;
            } else if (event.isEndElement()) {
                if (depth == 0) {
                    return event;
                }
                depth--;
            }
        }
        throw new IllegalStateException("unbalanced element inside person element");
    }

    /** Returns whether {@code event} is a start tag with the given local name. */
    private static boolean isStartOf(XMLEvent event, String localName) {
        return event.isStartElement()
                && localName.equals(event.asStartElement().getName().getLocalPart());
    }

    /** Returns whether {@code event} is an end tag with the given local name. */
    private static boolean isEndOf(XMLEvent event, String localName) {
        return event.isEndElement()
                && localName.equals(event.asEndElement().getName().getLocalPart());
    }

    /** Returns whether the content of the named element is replaced with fake data. */
    private static boolean isFakedField(String field) {
        return "personid".equals(field) || "firstname".equals(field)
                || "lastname".equals(field) || "address".equals(field);
    }

    /** Returns the fake value for one of the element names accepted by isFakedField. */
    private static String fakeValueFor(Person fakePerson, String field) {
        if ("personid".equals(field)) {
            return fakePerson.personId;
        }
        if ("firstname".equals(field)) {
            return fakePerson.firstName;
        }
        if ("lastname".equals(field)) {
            return fakePerson.lastName;
        }
        return fakePerson.address;
    }

    /**
     * Returns the fake person for the given real personid, creating and caching it
     * on first use so that repeated ids always map to the same fake data.
     */
    private static Person getFakePerson(String personId) {
        Person fakePerson = FAKE_PERSONS.get(personId);
        if (fakePerson == null) {
            // Create a simple fake user...
            fakePerson = new Person();
            // A running number, so the real id never reaches the output.
            fakePerson.personId = String.format(Locale.ROOT, "%012d", FAKE_PERSONS.size() + 1);
            fakePerson.firstName = "fake first name: " + Math.random();
            fakePerson.lastName = "fake last name: " + Math.random();
            fakePerson.address = "fake address: " + Math.random();
            FAKE_PERSONS.put(personId, fakePerson);
        }
        return fakePerson;
    }

    /** Fake replacement values for one person. */
    static class Person {
        String personId;
        String firstName;
        String lastName;
        String address;
    }
}
use persons.xml
input:
<adocument> <stuff> <stuffa></stuffa> </stuff> <otherstuff> <otherstuff> <abc>yada yada</abc> </otherstuff> </otherstuff> <person> <uuid>11111111-1111-1111-1111-111111111111</uuid> <firstname>some</firstname> <lastname>person</lastname> <personid>111111111111</personid> </person> <person> <uuid>22222222-2222-2222-2222-222222222222</uuid> <firstname>another person</firstname> <address>main st. 2</address> <personid>222222222222</personid> </person> <person> <uuid>33333333-3333-3333-3333-333333333333</uuid> <firstname>some</firstname> <lastname>person</lastname> <personid>111111111111</personid> </person> <morestuff> <foo></foo> <foo>fooo</foo> <foo><bar></bar></foo> <foo> <bar></bar> <bar/> <bar>bb</bar> </foo> <bar/> </morestuff> </adocument>
producing fakepersons.xml
result:
<?xml version="1.0" encoding="utf-8"?><adocument> <stuff> <stuffa></stuffa> </stuff> <otherstuff> <otherstuff> <abc>yada yada</abc> </otherstuff> </otherstuff> <person> <uuid>11111111-1111-1111-1111-111111111111</uuid> <firstname>fake first name: 0.9518514637129984</firstname> <lastname>fake last name: 0.3495378044884426</lastname> <personid>111111111111</personid> </person> <person> <uuid>22222222-2222-2222-2222-222222222222</uuid> <firstname>fake first name: 0.8945739434355868</firstname> <address>fake address: 0.40784763231471777</address> <personid>222222222222</personid> </person> <person> <uuid>33333333-3333-3333-3333-333333333333</uuid> <firstname>fake first name: 0.7863207851479257</firstname> <lastname>fake last name: 0.09918620445731652</lastname> <personid>111111111111</personid> </person> <morestuff> <foo></foo> <foo>fooo</foo> <foo><bar></bar></foo> <foo> <bar></bar> <bar></bar> <bar>bb</bar> </foo> <bar></bar> </morestuff> </adocument>
Comments
Post a Comment