佳音的博客

2007/08/10

root用户相同权限的用户

Filed under: Uncategorized — 佳音 @ 11:12 上午

useradd -u 0 -o -g 0 username 使用-o参数可以允许建立相同id的用户

]]>

2007/08/08

lucene highlighter的使用

Filed under: Uncategorized — 佳音 @ 7:22 下午

package org.apache.lucene.search.highlight;

/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements.  See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License.  You may obtain a copy of the License at * *     http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */

import java.io.ByteArrayInputStream;import java.io.IOException;import java.io.Reader;import java.io.StringReader;import java.util.*;

import javax.xml.parsers.DocumentBuilder;import javax.xml.parsers.DocumentBuilderFactory;

import junit.framework.TestCase;

import org.apache.lucene.analysis.*;import org.apache.lucene.analysis.standard.StandardAnalyzer;import org.apache.lucene.document.Document;import org.apache.lucene.document.Field;import org.apache.lucene.index.IndexReader;import org.apache.lucene.index.IndexWriter;import org.apache.lucene.index.Term;import org.apache.lucene.queryParser.ParseException;import org.apache.lucene.queryParser.QueryParser;import org.apache.lucene.search.FilteredQuery;import org.apache.lucene.search.Hits;import org.apache.lucene.search.IndexSearcher;import org.apache.lucene.search.MultiSearcher;import org.apache.lucene.search.PhraseQuery;import org.apache.lucene.search.Query;import org.apache.lucene.search.RangeFilter;import org.apache.lucene.search.Searcher;import org.apache.lucene.search.TermQuery;import org.apache.lucene.search.spans.SpanNearQuery;import org.apache.lucene.search.spans.SpanQuery;import org.apache.lucene.search.spans.SpanTermQuery;import org.apache.lucene.store.RAMDirectory;import org.w3c.dom.Element;import org.w3c.dom.NodeList;

/** * JUnit Test for Highlighter class. * @author mark@searcharea.co.uk */public class HighlighterTest extends TestCase implements Formatter{ private IndexReader reader; private static final String FIELD_NAME = "contents"; private Query query; RAMDirectory ramDir; public Searcher searcher = null; public Hits hits = null; int numHighlights = 0; Analyzer analyzer=new StandardAnalyzer();

 String texts[] =  {   "Hello this is a piece of text that is very long and contains too much preamble and the meat is really here which says kennedy has been shot",   "This piece of text refers to Kennedy at the beginning then has a longer piece of text that is very long in the middle and finally ends with another reference to Kennedy",   "JFK has been shot",   "John Kennedy has been shot",   "This text has a typo in referring to Keneddy" };

 /**  * Constructor for HighlightExtractorTest.  * @param arg0  */ public HighlighterTest(String arg0) {  super(arg0); }

 public void testSimpleHighlighter() throws Exception {  doSearching("Kennedy");  Highlighter highlighter = new Highlighter(new QueryScorer(query));  highlighter.setTextFragmenter(new SimpleFragmenter(40));  int maxNumFragmentsRequired = 2;  for (int i = 0; i < hits.length(); i++)  {   String text = hits.doc(i).get(FIELD_NAME);   TokenStream tokenStream=analyzer.tokenStream(FIELD_NAME,new StringReader(text));

   String result =    highlighter.getBestFragments(tokenStream,text,maxNumFragmentsRequired, "...");   System.out.println("\t" + result);  }  //Not sure we can assert anything here - just running to check we dont throw any exceptions }

 public void testGetBestFragmentsSimpleQuery() throws Exception {  doSearching("Kennedy");  doStandardHighlights();  assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 4); } public void testGetFuzzyFragments() throws Exception {  doSearching("Kinnedy~");  doStandardHighlights();  assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 5); }

 public void testGetWildCardFragments() throws Exception {  doSearching("K?nnedy");  doStandardHighlights();  assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 4); } public void testGetMidWildCardFragments() throws Exception {  doSearching("K*dy");  doStandardHighlights();  assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 5); } public void testGetRangeFragments() throws Exception {  String queryString=FIELD_NAME + ":[kannedy TO kznnedy]"; 

  //Need to explicitly set the QueryParser property to use RangeQuery rather than RangeFilters  QueryParser parser=new QueryParser(FIELD_NAME, new StandardAnalyzer());  parser.setUseOldRangeQuery(true);  query = parser.parse(queryString);  doSearching(query);

  doStandardHighlights();  assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 5); }

 public void testGetBestFragmentsPhrase() throws Exception {  doSearching("\"John Kennedy\"");  doStandardHighlights();  //Currently highlights "John" and "Kennedy" separately  assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 2); } public void testGetBestFragmentsSpan() throws Exception {  SpanQuery clauses[]={   new SpanTermQuery(new Term("contents","john")),   new SpanTermQuery(new Term("contents","kennedy")),   }; 

  SpanNearQuery snq=new SpanNearQuery(clauses,1,true);  doSearching(snq);  doStandardHighlights();  //Currently highlights "John" and "Kennedy" separately  assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 2); }

 public void testOffByOne() throws IOException  {     TermQuery query= new TermQuery( new Term( "data", "help" ));     Highlighter hg = new Highlighter(new SimpleHTMLFormatter(), new QueryScorer( query ));     hg.setTextFragmenter( new NullFragmenter() );

     String match = null;     match = hg.getBestFragment( new StandardAnalyzer(), "data", "help me [54-65]");     assertEquals("help me [54-65]", match); }   public void testGetBestFragmentsFilteredQuery() throws Exception {  RangeFilter rf=new RangeFilter("contents","john","john",true,true);  SpanQuery clauses[]={    new SpanTermQuery(new Term("contents","john")),    new SpanTermQuery(new Term("contents","kennedy")),    };   SpanNearQuery snq=new SpanNearQuery(clauses,1,true);  FilteredQuery fq=new FilteredQuery(snq,rf);

  doSearching(fq);  doStandardHighlights();  //Currently highlights "John" and "Kennedy" separately  assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 2); } public void testGetBestFragmentsFilteredPhraseQuery() throws Exception {  RangeFilter rf=new RangeFilter("contents","john","john",true,true);  PhraseQuery pq=new PhraseQuery();  pq.add(new Term("contents","john"));  pq.add(new  Term("contents","kennedy"));  FilteredQuery fq=new FilteredQuery(pq,rf);

  doSearching(fq);  doStandardHighlights();  //Currently highlights "John" and "Kennedy" separately  assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 2); }

 public void testGetBestFragmentsMultiTerm() throws Exception {  doSearching("John Kenn*");  doStandardHighlights();  assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 5); } public void testGetBestFragmentsWithOr() throws Exception {  doSearching("JFK OR Kennedy");  doStandardHighlights();  assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 5); }

 public void testGetBestSingleFragment() throws Exception {  doSearching("Kennedy");  Highlighter highlighter =new Highlighter(this,new QueryScorer(query));  highlighter.setTextFragmenter(new SimpleFragmenter(40));

  for (int i = 0; i < hits.length(); i++)  {   String text = hits.doc(i).get(FIELD_NAME);   TokenStream tokenStream=analyzer.tokenStream(FIELD_NAME,new StringReader(text));   String result = highlighter.getBestFragment(tokenStream,text);   System.out.println("\t" + result);  }  assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 4);

  numHighlights = 0;  for (int i = 0; i < hits.length(); i++)  {      String text = hits.doc(i).get(FIELD_NAME);      highlighter.getBestFragment(analyzer, FIELD_NAME,text);  }  assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 4);

  numHighlights = 0;  for (int i = 0; i < hits.length(); i++)  {      String text = hits.doc(i).get(FIELD_NAME);      highlighter.getBestFragments(analyzer,FIELD_NAME, text, 10);  }  assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 4);

 }

 public void testGetBestSingleFragmentWithWeights() throws Exception {  WeightedTerm[]wTerms=new WeightedTerm[2];  wTerms[0]=new WeightedTerm(10f,"hello");  wTerms[1]=new WeightedTerm(1f,"kennedy");  Highlighter highlighter =new Highlighter(new QueryScorer(wTerms));  TokenStream tokenStream=analyzer.tokenStream(FIELD_NAME,new StringReader(texts[0]));  highlighter.setTextFragmenter(new SimpleFragmenter(2));

  String result = highlighter.getBestFragment(tokenStream,texts[0]).trim();  assertTrue("Failed to find best section using weighted terms. Found: ["+result+"]"   , "Hello".equals(result));

  //readjust weights  wTerms[1].setWeight(50f);  tokenStream=analyzer.tokenStream(FIELD_NAME,new StringReader(texts[0]));  highlighter =new Highlighter(new QueryScorer(wTerms));  highlighter.setTextFragmenter(new SimpleFragmenter(2));

  result = highlighter.getBestFragment(tokenStream,texts[0]).trim();  assertTrue("Failed to find best section using weighted terms. Found: "+result   , "kennedy".equals(result)); }

 // tests a "complex" analyzer that produces multiple  // overlapping tokens  public void testOverlapAnalyzer() throws Exception {  HashMap synonyms = new HashMap();  synonyms.put("football", "soccer,footie");  Analyzer analyzer = new SynonymAnalyzer(synonyms);  String srchkey = "football";

  String s = "football-soccer in the euro 2004 footie competition";  QueryParser parser=new QueryParser("bookid",analyzer);  Query query = parser.parse(srchkey);

  Highlighter highlighter = new Highlighter(new QueryScorer(query));  TokenStream tokenStream =   analyzer.tokenStream(null, new StringReader(s));  // Get 3 best fragments and seperate with a "..."  String result = highlighter.getBestFragments(tokenStream, s, 3, "...");  String expectedResult="football-soccer in the euro 2004 footie competition";  assertTrue("overlapping analyzer should handle highlights OK",expectedResult.equals(result)); }

 public void testGetSimpleHighlight() throws Exception {  doSearching("Kennedy");  Highlighter highlighter =   new Highlighter(this,new QueryScorer(query));

  for (int i = 0; i < hits.length(); i++)  {   String text = hits.doc(i).get(FIELD_NAME);   TokenStream tokenStream=analyzer.tokenStream(FIELD_NAME,new StringReader(text));

   String result = highlighter.getBestFragment(tokenStream,text);   System.out.println("\t" + result);  }  assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 4); }

 public void testGetTextFragments() throws Exception {  doSearching("Kennedy");  Highlighter highlighter =   new Highlighter(this,new QueryScorer(query));  highlighter.setTextFragmenter(new SimpleFragmenter(20));

  for (int i = 0; i < hits.length(); i++)  {   String text = hits.doc(i).get(FIELD_NAME);   TokenStream tokenStream=analyzer.tokenStream(FIELD_NAME,new StringReader(text));

   String stringResults[] = highlighter.getBestFragments(tokenStream,text,10);

   tokenStream=analyzer.tokenStream(FIELD_NAME,new StringReader(text));   TextFragment fragmentResults[] = highlighter.getBestTextFragments(tokenStream,text,true,10);

   assertTrue("Failed to find correct number of text Fragments: " +     fragmentResults.length + " vs "+ stringResults.length, fragmentResults.length==stringResults.length);   for (int j = 0; j < stringResults.length; j++)    {    System.out.println(fragmentResults[j]);    assertTrue("Failed to find same text Fragments: " +      fragmentResults[j] + " found", fragmentResults[j].toString().equals(stringResults[j]));

   }

  } }

 public void testMaxSizeHighlight() throws Exception {  doSearching("meat");  Highlighter highlighter =   new Highlighter(this,new QueryScorer(query));  highlighter.setMaxDocBytesToAnalyze(30);  TokenStream tokenStream=analyzer.tokenStream(FIELD_NAME,new StringReader(texts[0]));  highlighter.getBestFragment(tokenStream,texts[0]);  assertTrue("Setting MaxDocBytesToAnalyze should have prevented " +   "us from finding matches for this record: " + numHighlights +    " found", numHighlights == 0); } public void testMaxSizeHighlightTruncates() throws IOException  {     String goodWord="goodtoken";     String stopWords[]={"stoppedtoken"};

     TermQuery query= new TermQuery( new Term( "data", goodWord ));     SimpleHTMLFormatter fm=new SimpleHTMLFormatter();     Highlighter hg = new Highlighter(fm, new QueryScorer( query ));     hg.setTextFragmenter( new NullFragmenter() );

     String match = null;     StringBuffer sb=new StringBuffer();     sb.append(goodWord);     for(int i=0;i<10000;i++)     {      sb.append(" ");      sb.append(stopWords[0]);     }

     hg.setMaxDocBytesToAnalyze(100);     match = hg.getBestFragment( new StandardAnalyzer(stopWords), "data", sb.toString());     assertTrue("Matched text should be no more than 100 chars in length ",        match.length()4\" claims article";        //run the highlighter on the raw content (scorer does not score any tokens for         // highlighting but scores a single fragment for selection        Highlighter highlighter = new Highlighter(this,                new SimpleHTMLEncoder(), new Scorer()                {                    public void startFragment(TextFragment newFragment)                    {                    }                    public float getTokenScore(Token token)                    {                        return 0;                    }                    public float getFragmentScore()                    {                        return 1;                    }                });        highlighter.setTextFragmenter(new SimpleFragmenter(2000));        TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME,                new StringReader(rawDocContent));

        String encodedSnippet = highlighter.getBestFragments(tokenStream, rawDocContent,1,"");        //An ugly bit of XML creation:        String xhtml="\n"+              "\n"+              "\n"+              "\n"+              "\n"+              "\n"+              "\n"+              "

"+encodedSnippet+"

\n"+ "\n"+ ""; //now an ugly built of XML parsing to test the snippet is encoded OK DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); DocumentBuilder db = dbf.newDocumentBuilder(); org.w3c.dom.Document doc = db.parse(new ByteArrayInputStream(xhtml.getBytes())); Element root=doc.getDocumentElement(); NodeList nodes=root.getElementsByTagName("body"); Element body=(Element) nodes.item(0); nodes=body.getElementsByTagName("h2"); Element h2=(Element) nodes.item(0); String decodedSnippet=h2.getFirstChild().getNodeValue(); assertEquals("XHTML Encoding should have worked:", rawDocContent,decodedSnippet); } public void testMultiSearcher() throws Exception { //setup index 1 RAMDirectory ramDir1 = new RAMDirectory(); IndexWriter writer1 = new IndexWriter(ramDir1, new StandardAnalyzer(), true); Document d = new Document(); Field f = new Field(FIELD_NAME, "multiOne", Field.Store.YES, Field.Index.TOKENIZED); d.add(f); writer1.addDocument(d); writer1.optimize(); writer1.close(); IndexReader reader1 = IndexReader.open(ramDir1); //setup index 2 RAMDirectory ramDir2 = new RAMDirectory(); IndexWriter writer2 = new IndexWriter(ramDir2, new StandardAnalyzer(), true); d = new Document(); f = new Field(FIELD_NAME, "multiTwo", Field.Store.YES, Field.Index.TOKENIZED); d.add(f); writer2.addDocument(d); writer2.optimize(); writer2.close(); IndexReader reader2 = IndexReader.open(ramDir2); IndexSearcher searchers[]=new IndexSearcher[2]; searchers[0] = new IndexSearcher(ramDir1); searchers[1] = new IndexSearcher(ramDir2); MultiSearcher multiSearcher=new MultiSearcher(searchers); QueryParser parser=new QueryParser(FIELD_NAME, new StandardAnalyzer()); query = parser.parse("multi*"); System.out.println("Searching for: " + query.toString(FIELD_NAME)); //at this point the multisearcher calls combine(query[]) hits = multiSearcher.search(query); //query = QueryParser.parse("multi*", FIELD_NAME, new StandardAnalyzer()); Query expandedQueries[]=new Query[2]; 
expandedQueries[0]=query.rewrite(reader1); expandedQueries[1]=query.rewrite(reader2); query=query.combine(expandedQueries); //create an instance of the highlighter with the tags used to surround highlighted text Highlighter highlighter = new Highlighter(this,new QueryScorer(query)); for (int i = 0; i < hits.length(); i++) { String text = hits.doc(i).get(FIELD_NAME); TokenStream tokenStream=analyzer.tokenStream(FIELD_NAME,new StringReader(text)); String highlightedText = highlighter.getBestFragment(tokenStream,text); System.out.println(highlightedText); } assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 2); } public void testFieldSpecificHighlighting() throws IOException, ParseException { String docMainText="fred is one of the people"; QueryParser parser=new QueryParser(FIELD_NAME,analyzer); Query query=parser.parse("fred category:people"); //highlighting respects fieldnames used in query QueryScorer fieldSpecificScorer=new QueryScorer(query, "contents"); Highlighter fieldSpecificHighlighter = new Highlighter(new SimpleHTMLFormatter(),fieldSpecificScorer); fieldSpecificHighlighter.setTextFragmenter(new NullFragmenter()); String result=fieldSpecificHighlighter.getBestFragment(analyzer,FIELD_NAME,docMainText); assertEquals("Should match",result,"fred is one of the people"); //highlighting does not respect fieldnames used in query QueryScorer fieldInSpecificScorer=new QueryScorer(query); Highlighter fieldInSpecificHighlighter = new Highlighter(new SimpleHTMLFormatter(),fieldInSpecificScorer); fieldInSpecificHighlighter.setTextFragmenter(new NullFragmenter()); result=fieldInSpecificHighlighter.getBestFragment(analyzer,FIELD_NAME,docMainText); assertEquals("Should match",result,"fred is one of the people"); reader.close(); } protected TokenStream getTS2() { //String s = "Hi-Speed10 foo"; return new TokenStream() { Iterator iter; List lst; { lst = new ArrayList(); Token t; t = new Token("hi",0,2); lst.add(t); t = new 
Token("hispeed",0,8); lst.add(t); t = new Token("speed",3,8); t.setPositionIncrement(0); lst.add(t); t = new Token("10",8,10); lst.add(t); t = new Token("foo",11,14); lst.add(t); iter = lst.iterator(); } public Token next() throws IOException { return iter.hasNext() ? (Token)iter.next() : null; } }; } // same token-stream as above, but the bigger token comes first this time protected TokenStream getTS2a() { //String s = "Hi-Speed10 foo"; return new TokenStream() { Iterator iter; List lst; { lst = new ArrayList(); Token t; t = new Token("hispeed",0,8); lst.add(t); t = new Token("hi",0,2); t.setPositionIncrement(0); lst.add(t); t = new Token("speed",3,8); lst.add(t); t = new Token("10",8,10); lst.add(t); t = new Token("foo",11,14); lst.add(t); iter = lst.iterator(); } public Token next() throws IOException { return iter.hasNext() ? (Token)iter.next() : null; } }; } public void testOverlapAnalyzer2() throws Exception { String s = "Hi-Speed10 foo"; Query query; Highlighter highlighter; String result; query = new QueryParser("text",new WhitespaceAnalyzer()).parse("foo"); highlighter = new Highlighter(new QueryScorer(query)); result = highlighter.getBestFragments(getTS2(), s, 3, "..."); assertEquals("Hi-Speed10 foo",result); query = new QueryParser("text",new WhitespaceAnalyzer()).parse("10"); highlighter = new Highlighter(new QueryScorer(query)); result = highlighter.getBestFragments(getTS2(), s, 3, "..."); assertEquals("Hi-Speed10 foo",result); query = new QueryParser("text",new WhitespaceAnalyzer()).parse("hi"); highlighter = new Highlighter(new QueryScorer(query)); result = highlighter.getBestFragments(getTS2(), s, 3, "..."); assertEquals("Hi-Speed10 foo",result); query = new QueryParser("text",new WhitespaceAnalyzer()).parse("speed"); highlighter = new Highlighter(new QueryScorer(query)); result = highlighter.getBestFragments(getTS2(), s, 3, "..."); assertEquals("Hi-Speed10 foo",result); query = new QueryParser("text",new WhitespaceAnalyzer()).parse("hispeed"); 
highlighter = new Highlighter(new QueryScorer(query)); result = highlighter.getBestFragments(getTS2(), s, 3, "..."); assertEquals("Hi-Speed10 foo",result); query = new QueryParser("text",new WhitespaceAnalyzer()).parse("hi speed"); highlighter = new Highlighter(new QueryScorer(query)); result = highlighter.getBestFragments(getTS2(), s, 3, "..."); assertEquals("Hi-Speed10 foo",result); /////////////////// same tests, just put the bigger overlapping token first query = new QueryParser("text",new WhitespaceAnalyzer()).parse("foo"); highlighter = new Highlighter(new QueryScorer(query)); result = highlighter.getBestFragments(getTS2a(), s, 3, "..."); assertEquals("Hi-Speed10 foo",result); query = new QueryParser("text",new WhitespaceAnalyzer()).parse("10"); highlighter = new Highlighter(new QueryScorer(query)); result = highlighter.getBestFragments(getTS2a(), s, 3, "..."); assertEquals("Hi-Speed10 foo",result); query = new QueryParser("text",new WhitespaceAnalyzer()).parse("hi"); highlighter = new Highlighter(new QueryScorer(query)); result = highlighter.getBestFragments(getTS2a(), s, 3, "..."); assertEquals("Hi-Speed10 foo",result); query = new QueryParser("text",new WhitespaceAnalyzer()).parse("speed"); highlighter = new Highlighter(new QueryScorer(query)); result = highlighter.getBestFragments(getTS2a(), s, 3, "..."); assertEquals("Hi-Speed10 foo",result); query = new QueryParser("text",new WhitespaceAnalyzer()).parse("hispeed"); highlighter = new Highlighter(new QueryScorer(query)); result = highlighter.getBestFragments(getTS2a(), s, 3, "..."); assertEquals("Hi-Speed10 foo",result); query = new QueryParser("text",new WhitespaceAnalyzer()).parse("hi speed"); highlighter = new Highlighter(new QueryScorer(query)); result = highlighter.getBestFragments(getTS2a(), s, 3, "..."); assertEquals("Hi-Speed10 foo",result); } /* public void testBigramAnalyzer() throws IOException, ParseException { //test to ensure analyzers with none-consecutive start/end offsets //dont 
double-highlight text //setup index 1 RAMDirectory ramDir = new RAMDirectory(); Analyzer bigramAnalyzer=new CJKAnalyzer(); IndexWriter writer = new IndexWriter(ramDir,bigramAnalyzer , true); Document d = new Document(); Field f = new Field(FIELD_NAME, "java abc def", true, true, true); d.add(f); writer.addDocument(d); writer.close(); IndexReader reader = IndexReader.open(ramDir); IndexSearcher searcher=new IndexSearcher(reader); query = QueryParser.parse("abc", FIELD_NAME, bigramAnalyzer); System.out.println("Searching for: " + query.toString(FIELD_NAME)); hits = searcher.search(query); Highlighter highlighter = new Highlighter(this,new QueryFragmentScorer(query)); for (int i = 0; i < hits.length(); i++) { String text = hits.doc(i).get(FIELD_NAME); TokenStream tokenStream=bigramAnalyzer.tokenStream(FIELD_NAME,new StringReader(text)); String highlightedText = highlighter.getBestFragment(tokenStream,text); System.out.println(highlightedText); } }*/ public String highlightTerm(String originalText , TokenGroup group) { if(group.getTotalScore()<=0) { return originalText; } numHighlights++; //update stats used in assertions return "" + originalText + ""; } public void doSearching(String queryString) throws Exception { QueryParser parser=new QueryParser(FIELD_NAME, new StandardAnalyzer()); query = parser.parse(queryString); doSearching(query); } public void doSearching(Query unReWrittenQuery) throws Exception { searcher = new IndexSearcher(ramDir); //for any multi-term queries to work (prefix, wildcard, range,fuzzy etc) you must use a rewritten query! 
query=unReWrittenQuery.rewrite(reader); System.out.println("Searching for: " + query.toString(FIELD_NAME)); hits = searcher.search(query); } void doStandardHighlights() throws Exception { Highlighter highlighter =new Highlighter(this,new QueryScorer(query)); highlighter.setTextFragmenter(new SimpleFragmenter(20)); for (int i = 0; i < hits.length(); i++) { String text = hits.doc(i).get(FIELD_NAME); int maxNumFragmentsRequired = 2; String fragmentSeparator = "..."; TokenStream tokenStream=analyzer.tokenStream(FIELD_NAME,new StringReader(text)); String result = highlighter.getBestFragments( tokenStream, text, maxNumFragmentsRequired, fragmentSeparator); System.out.println("\t" + result); } } /* * @see TestCase#setUp() */ protected void setUp() throws Exception { ramDir = new RAMDirectory(); IndexWriter writer = new IndexWriter(ramDir, new StandardAnalyzer(), true); for (int i = 0; i < texts.length; i++) { addDoc(writer, texts[i]); } writer.optimize(); writer.close(); reader = IndexReader.open(ramDir); numHighlights = 0; } private void addDoc(IndexWriter writer, String text) throws IOException { Document d = new Document(); Field f = new Field(FIELD_NAME, text,Field.Store.YES, Field.Index.TOKENIZED); d.add(f); writer.addDocument(d); } /* * @see TestCase#tearDown() */ protected void tearDown() throws Exception { super.tearDown(); } } //===================================================================//========== BEGIN TEST SUPPORTING CLASSES//========== THESE LOOK LIKE, WITH SOME MORE EFFORT THESE COULD BE//========== MADE MORE GENERALLY USEFUL.// TODO - make synonyms all interchangeable with each other and produce// a version that does hyponyms - the "is a specialised type of ...."// so that car = audi, bmw and volkswagen but bmw != audi so different// behaviour to synonyms//=================================================================== class SynonymAnalyzer extends Analyzer{ private Map synonyms; public SynonymAnalyzer(Map synonyms) { this.synonyms = synonyms; 
} /* (non-Javadoc) * @see org.apache.lucene.analysis.Analyzer#tokenStream(java.lang.String, java.io.Reader) */ public TokenStream tokenStream(String arg0, Reader arg1) { return new SynonymTokenizer(new LowerCaseTokenizer(arg1), synonyms); }} /** * Expands a token stream with synonyms (TODO - make the synonyms analyzed by choice of analyzer) * @author MAHarwood */class SynonymTokenizer extends TokenStream{ private TokenStream realStream; private Token currentRealToken = null; private Map synonyms; StringTokenizer st = null; public SynonymTokenizer(TokenStream realStream, Map synonyms) { this.realStream = realStream; this.synonyms = synonyms; } public Token next() throws IOException { if (currentRealToken == null) { Token nextRealToken = realStream.next(); if (nextRealToken == null) { return null; } String expansions = (String) synonyms.get(nextRealToken.termText()); if (expansions == null) { return nextRealToken; } st = new StringTokenizer(expansions, ","); if (st.hasMoreTokens()) { currentRealToken = nextRealToken; } return currentRealToken; } else { String nextExpandedValue = st.nextToken(); Token expandedToken = new Token( nextExpandedValue, currentRealToken.startOffset(), currentRealToken.endOffset()); expandedToken.setPositionIncrement(0); if (!st.hasMoreTokens()) { currentRealToken = null; st = null; } return expandedToken; } } }

]]>

Comments (0)

java 分页类[o]

Filed under: Uncategorized — 佳音 @ 5:09 下午

import java.util.ArrayList;
import java.util.List;

public class Pager {

Comments (0)

过滤 脑残体中的控制字符

Filed under: Uncategorized — 佳音 @ 10:05 上午

$mysql = mysqli_connect(“localhost”, “user”, “password”, “blog”);
$result = $mysql->query(“SET NAMES gbk”);
$result = $mysql->query(“SELECT * FROM blog_clip where deleted

Comments (0)

2007/08/07

PHP中GBK和UTF8编码处理[http://www.fulin.org/blog/2007/05/29/php-utf8-gbk/]

Filed under: Uncategorized — 佳音 @ 10:41 上午

当我用 Java 解析带有特殊字符(同事称之为"简体、繁体、脑残体",比如 \z)的文本时就会报错。
在网上看到这篇文章 记录下来 用做过滤非显示字符

一、编码范围1. GBK (GB2312/GB18030)
\x00-\xff GBK双字节编码范围
\x20-\x7f ASCII
\xa1-\xff 中文
\x80-\xff 中文

2. UTF-8 (Unicode)
\u4e00-\u9fa5 (中文)
\u3130-\u318F (韩文)
\uAC00-\uD7A3 (韩文)
\u0800-\u4e00 (日文)
ps: 韩文是大于[\u9fa5]的字符
正则例子:

  1.  
  2. preg_replace('/([\x80-\xff])/','',$str);
  3. preg_replace('/([\x{4e00}-\x{9fa5}])/u','',$str);
  4.  

二、代码例子

  1.  
  // Check whether the content contains Chinese text - GBK (PHP)
  //
  // Runs a byte-oriented regex over $s looking for one byte in the range
  // 0x80-0xFF followed by one more character (any character except a newline,
  // which is what an unmodified "." matches in PCRE).
  // NOTE(review): presumably $s is GBK-encoded, as the listing label says —
  // any other encoding with high bytes will also match.
  //
  // @param string $s  byte string to inspect
  // @return int|false the preg_match() result: 1 when the pattern matches,
  //                   0 when it does not, false on a PCRE error
  function check_is_chinese($s){
      // Plain ASCII quotes: the typographic quotes in the published listing
      // are not string delimiters and make PHP fail to parse the file.
      return preg_match('/[\x80-\xff]./', $s);
  }
  // Get the length of a string - GBK (PHP)
  //
  // Scans $str from left to right. A byte in 0x80-0xFF is consumed together
  // with the byte after it and counted as one unit; every other byte counts
  // as one unit on its own.
  //
  // @param string $str  byte string to measure
  // @return int         number of units counted
  function gb_strlen($str){
      $units = 0;
      $pos = 0;
      while ($pos < strlen($str)) {
          $byte = substr($str, $pos, 1);
          if (preg_match("/[\x80-\xff]/", $byte)) {
              // High byte: skip the trailing byte of the two-byte pair.
              $pos++;
          }
          $units++;
          $pos++;
      }
      return $units;
  }
  // Take a prefix of a string - GBK (PHP)
  //
  // Returns the leading bytes of $str that make up the first $len units,
  // where a byte in 0x80-0xFF plus the byte after it form one unit and any
  // other byte is a unit by itself. If $str holds fewer than $len units the
  // scan simply runs to the end of the string.
  //
  // @param string $str  byte string to cut
  // @param int    $len  number of units to keep
  // @return string      result of substr($str, 0, <bytes scanned>)
  function gb_substr($str, $len){
      $taken = 0;
      $offset = 0;
      while ($offset < strlen($str) && $taken != $len) {
          if (preg_match("/[\x80-\xff]/", substr($str, $offset, 1))) {
              // High byte: the unit is two bytes wide.
              $offset++;
          }
          $taken++;
          $offset++;
      }
      return substr($str, 0, $offset);
  }
  // Count the length of a string - UTF-8 (PHP)
  //
  // Walks $str by lead byte. An ASCII byte (<= 127) adds 1 to the result. A
  // multi-byte sequence adds 2: one increment inside the lead-byte branch and
  // one from the shared increment at the end of the loop body.
  // NOTE(review): counting 2 per multi-byte character looks like a
  // display-width measure rather than a character count — confirm against
  // callers before changing it.
  //
  // Calls die() when a byte above 127 in lead position is not in 192-247,
  // i.e. is not the lead byte of a 2-, 3- or 4-byte sequence.
  //
  // @param string $str  byte string, expected to be UTF-8
  // @return int         weighted length as described above
  function utf8_strlen($str) {
      $count = 0;
      for($i = 0; $i < strlen($str); $i++){
          $value = ord($str[$i]);
          if($value > 127) {
              $count++;
              // Skip the continuation bytes that belong to this lead byte.
              if($value >= 192 && $value <= 223) $i++;
              elseif($value >= 224 && $value <= 239) $i = $i + 2;
              elseif($value >= 240 && $value <= 247) $i = $i + 3;
              // ASCII quotes: the typographic quotes in the published listing
              // are a parse error.
              else die('Not a UTF-8 compatible string');
          }
          $count++;
      }
      return $count;
  }
  // Cut a substring - UTF-8 (PHP)
  //
  // $position and $length are measured with the same weighting the loop uses:
  // an ASCII byte counts 1, a multi-byte sequence counts 2. The function
  // records the byte offset at which the running count first reaches
  // $position, then the byte offset at which the count accumulated since that
  // point reaches $length, and returns the bytes in between.
  // NOTE(review): the 2-per-multi-byte weighting looks like a display-width
  // measure — confirm against callers before changing it.
  //
  // Calls die() when a byte above 127 in lead position is not in 192-247.
  //
  // @param string $str       byte string, expected to be UTF-8
  // @param int    $position  weighted offset at which the result starts
  // @param int    $length    weighted length of the result
  // @return string           substr() of $str between the two byte offsets;
  //                          both default to strlen($str) when never reached
  function utf8_substr($str,$position,$length){
      $start_position = strlen($str);
      $start_byte = 0;
      $end_position = strlen($str);
      $count = 0;
      for($i = 0; $i < strlen($str); $i++){
          // First byte offset whose running count has reached $position.
          if($count >= $position && $start_position > $i){
              $start_position = $i;
              $start_byte = $count;
          }
          // Enough weighted length collected since the start offset: stop here.
          if(($count-$start_byte)>=$length) {
              $end_position = $i;
              break;
          }
          $value = ord($str[$i]);
          if($value > 127){
              $count++;
              // Skip the continuation bytes that belong to this lead byte.
              if($value >= 192 && $value <= 223) $i++;
              elseif($value >= 224 && $value <= 239) $i = $i + 2;
              elseif($value >= 240 && $value <= 247) $i = $i + 3;
              // ASCII quotes: the typographic quotes in the published listing
              // are a parse error.
              else die('Not a UTF-8 compatible string');
          }
          $count++;
      }
      return(substr($str,$start_position,$end_position-$start_position));
  }
  69.  

=============

  1.  
  // Check whether a string contains Korean (Hangul) characters - UTF-8 (JavaScript)
  //
  // Returns true as soon as one UTF-16 code unit of `str` lies strictly
  // between 0x3130 and 0x318F, or within 0xAC00..0xD7A3 inclusive; returns
  // false when no code unit does (including for an empty string).
  //
  // @param {string} str  text to inspect
  // @returns {boolean}   whether a code unit in either range was found
  function checkKoreaChar(str) {
      // `var` keeps the index local; the published listing assigned to an
      // undeclared `i`, which creates (or clobbers) a global.
      for (var i = 0; i < str.length; i++) {
          var code = str.charCodeAt(i);
          // 0x3130 is written with an ASCII "x"; the listing had the
          // multiplication sign "×" there, which is a syntax error.
          if ((code > 0x3130 && code < 0x318F) || (code >= 0xAC00 && code <= 0xD7A3)) {
              return true;
          }
      }
      return false;
  }
  // Check whether a string contains Chinese characters - GBK (JavaScript)
  //
  // Reports whether `s` holds at least one UTF-16 code unit outside the
  // range 0x00-0xFF. The test is purely numeric: any such code unit counts,
  // whatever script it belongs to.
  //
  // @param {string} s   text to inspect
  // @returns {boolean}  true when a code unit above 0xFF is present
  function check_chinese_char(s){
      var firstWide = s.search(/[^\x00-\xff]/);
      return firstWide !== -1;
  }
  15.  
Comments (0)

2007/08/02

用Lucene实现在检索结果中再检索[http://hi.baidu.com/injava/blog/item/8321960a5900cd3cb0351d1f.html]

Filed under: Uncategorized — 佳音 @ 3:07 下午

Lucene是可以做到的,利用lucene的Filter,具体可以查看lucene的api中的org.apache.lucene.search.CachingWrapperFilter,它可以缓存上次的搜索结果,从而实现在结果中的搜索。

测试实例:
package com.wsjava;
import java.io.IOException;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.CachingWrapperFilter;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryFilter;

public class IndexTest {

Comments (0)

文件 读取 加速

Filed under: Uncategorized — 佳音 @ 1:15 下午

最初,使用 RAMDisk 是通过在 ram 设备上建立文件系统并挂载来实现的。
# mkdir /mnt/ramfs
然后把下面的指令加入 /etc/rc.local 。
/sbin/mkfs -t ext3 /dev/ram0
/bin/mount /dev/ram0 /mnt/ramfs
/bin/chmod -R 1777 /mnt/ramfs
RAMDisk 的默认大小是 8M (可以在编译内核时设定),如果要增加 RAMDisk 的大小,需要在系统启动时给内核传参数,在 grub.conf 的 kernel 行末加上 ramdisk_size=131072 ,这里不能是 128M 这样的写法,应该是 128*1024 。
使用 ram 设备的灵活性不够,不方便。之后就使用 shm 了。
以 /tmp 绑定到 shm 为例,可以把下面的指令加入 /etc/rc.local 。
mkdir /dev/shm/tmp
chmod 1777 /dev/shm/tmp
mount --bind /dev/shm/tmp /tmp
使用 shm 要稍微方便些了,不过还是不够,更方便的办法是通过 mount 使用 tmpfs 文件系统。
# mkdir /mnt/tmpfs
# mount -t tmpfs -o size=128m,mode=1777 tmpfs /mnt/tmpfs
这里的 128m 就是 128MB 的意思。
卸载:# umount /mnt/tmpfs
可以添加相应的设置到 /etc/fstab 开机自动挂载。
Debian 系统在启动时会清空 /tmp 目录,让 /tmp 使用 tmpfs 再合适不过了,这样还可以提高系统性能。
编辑 /etc/fstab,加入或修改 /tmp 这行:
tmpfs

« Newer Posts

Powered by 00RZ