henry

@henry@henry.sanger.io

I’m working on a project which requires a headless browser that can execute JavaScript. I’ve tried a few different headless browsers, and none of them managed to run the JavaScript correctly.

Since I couldn’t find a working solution, I had to make one myself. That solution is a wrapper for JavaFX’s WebView. It’s surprisingly short, and I made a basic version of the code below in around 5 minutes:

import java.util.concurrent.CountDownLatch;

import javafx.application.Platform;
import javafx.beans.value.ChangeListener;
import javafx.concurrent.Worker;
import javafx.embed.swing.JFXPanel;
import javafx.scene.web.WebView;

/**
 * A minimal headless browser: a thin, blocking wrapper around a JavaFX {@link WebView}.
 *
 * <p>Each public method hands its work to the JavaFX application thread and blocks the
 * calling thread until that work has run, so the class can be driven from an ordinary
 * (non-JavaFX) thread such as {@code main}.
 */
public class HeadlessBrowser {
	
	// Internal WebView, used to load web pages.
	// Assigned on the JavaFX application thread by the constructor.
	WebView view;
	
	/**
	 * Starts JavaFX and creates the internal {@link WebView}.
	 */
	public HeadlessBrowser() {
		new JFXPanel(); // Initializes JavaFX
		
		// Instantiates the WebView. Using Platform.runLater because this has to be done on the JavaFX application thread
		runLater(() -> view = new WebView());
	}
	
	/**
	 * Starts loading the given URL. Returns once the load has been started; it does not
	 * wait for the page to finish loading (see {@link #navigateAndWait(String)}).
	 *
	 * @param url the address to load
	 */
	public void navigate(String url) {
		runLater(() -> view.getEngine().load(url));
	}
	
	// Holds the HTML most recently returned by getHTML().
	// Kept as a field only for code that may still read it; getHTML() itself
	// returns a per-call value so a failed call cannot hand back stale HTML.
	String html;
	
	/**
	 * Gets the HTML of the current page by evaluating
	 * {@code document.documentElement.outerHTML} in the page.
	 *
	 * @return the page's HTML, or {@code null} if the script did not produce a value
	 *         (for example because it threw, or the calling thread was interrupted
	 *         while waiting)
	 */
	public String getHTML() {
		// A lambda cannot assign to a local variable, so a one-element array carries the result back.
		final String[] result = new String[1];
		
		// The HTML is retrieved by executing some JavaScript. Once again, this has to run on the JavaFX application thread.
		runLater(() -> result[0] = (String) view.getEngine().executeScript("document.documentElement.outerHTML"));
		
		html = result[0];
		return result[0];
	}
	
	/**
	 * Loads the given URL and blocks until the load has either succeeded or failed.
	 *
	 * <p>If the calling thread is interrupted while waiting, the method returns early
	 * with the thread's interrupt flag set.
	 *
	 * @param url the address to load
	 * @throws IllegalStateException if called on the JavaFX application thread, where
	 *         blocking would prevent the load from ever completing
	 */
	public void navigateAndWait(String url) {
		if (Platform.isFxApplicationThread()) {
			throw new IllegalStateException("navigateAndWait must not be called on the JavaFX application thread");
		}
		
		final CountDownLatch loadFinished = new CountDownLatch(1);
		
		// FAILED is treated as "finished" too; otherwise a bad URL or a network error
		// would leave the caller blocked forever.
		// NOTE(review): CANCELLED is deliberately not handled here, since it can presumably
		// be reported for a previous, superseded load -- confirm against WebEngine behaviour.
		final ChangeListener<Worker.State> listener = (observable, oldState, newState) -> {
			if (newState == Worker.State.SUCCEEDED || newState == Worker.State.FAILED) {
				loadFinished.countDown();
			}
		};
		
		// Register the listener and start the load in the same JavaFX task so that
		// no state change can slip in between the two steps.
		runLater(() -> {
			view.getEngine().getLoadWorker().stateProperty().addListener(listener);
			view.getEngine().load(url);
		});
		
		try {
			loadFinished.await();
		} catch (InterruptedException e) {
			// Preserve the interrupt for the caller instead of swallowing it.
			Thread.currentThread().interrupt();
		} finally {
			// Unregister so listeners do not pile up across calls. No need to wait for this.
			Platform.runLater(() -> view.getEngine().getLoadWorker().stateProperty().removeListener(listener));
		}
	}
	
	// Stops JavaFX and the headless browser.
	// Maybe not the best way to do this, but frees up resources.
	public void close() {
		Platform.exit();
	}
	
	
	/**
	 * Like {@link Platform#runLater(Runnable)}, but blocks until the Runnable has been run.
	 *
	 * <p>When already on the JavaFX application thread the Runnable is run directly,
	 * because queueing it and then waiting on that same thread would deadlock.
	 * If the waiting thread is interrupted, this method returns early with the
	 * interrupt flag set; the Runnable stays queued.
	 *
	 * @param run the work to execute on the JavaFX application thread
	 */
	private void runLater(Runnable run) {
		if (Platform.isFxApplicationThread()) {
			run.run();
			return;
		}
		
		// queue on JavaFX thread and wait for completion
		final CountDownLatch doneLatch = new CountDownLatch(1);
		Platform.runLater(() -> {
			try {
				run.run();
			} finally {
				// Always release the waiter, even if the Runnable throws.
				doneLatch.countDown();
			}
		});
		try {
			doneLatch.await();
		} catch (InterruptedException e) {
			Thread.currentThread().interrupt();
		}
	}
 
}

Here’s how you can use the class:

/**
 * Example usage of HeadlessBrowser: loads a page, prints its HTML, and shuts the browser down.
 */
public class HeadlessBrowserTest {
	
	public static void main(String[] args) {
		HeadlessBrowser browser = new HeadlessBrowser();
		try {
			browser.navigateAndWait("https://www.example.com");
			System.out.println(browser.getHTML());
		} finally {
			// Always close the browser, even if loading or printing throws;
			// otherwise the JavaFX toolkit is never told to exit.
			browser.close();
		}
	}

}
You are not logged in. Please either log in or subscribe for the following link
Like a post to share the love.
Log in Sign up
Reply to join the conversation.
Log in Sign up