steverobbins/magescan

Idea for better version detection

gwillem opened this issue · 7 comments

I've downloaded all Magento versions and made a programmatic analysis of the md5sum distribution of {js, skin, media} files among releases. This yielded:

{
    "skin/adminhtml/default/default/boxes.css": {
        "84b67457247969a206456565111c456b": "CE 1.1.4", 
        "d0511b190cdddf865cca7873917f9a69": "CE 1.1.1", 
        "a2c7f9ddda846ba76220d7bcbe85c985": "CE 1.2.1", 
        "1cbeca223c2e15dcaf500caa5d05b4ed": "CE 1.7.0.0"
    }, 
    "js/varien/product.js": {
        "6af30941970891608b0be568896946db": "CE 1.2.0"
    }, 
    "js/mage/adminhtml/sales.js": {
        "839ead52e82a2041f937389445b8db04": "CE 1.3.3.0", 
        "bdacf81a3cf7121d7a20eaa266a684ec": "CE 1.5.1.0", 
        "d80c40eeef3ca62eb4243443fe41705e": "CE 1.5.0.1", 
        "48d609bb2958b93d7254c13957b704c4": "CE 1.6.1.0", 
        "a86ad3ba7ab64bf9b3d7d2b9861d93dc": "CE 1.0", 
        "a0436f1eee62dded68e0ec860baeb699": "CE 1.9.1.0", 
        "26c8fd113b4e51aeffe200ce7880b67a": "CE 1.8.0.0", 
        "5656a8c1c646afaaf260a130fe405691": "CE 1.8.1.0", 
        "95e730c4316669f2df71031d5439df21": "CE 1.1.0", 
        "17da0470950e8dd4b30ccb787b1605f5": "CE 1.1.6", 
        "5112f328e291234a943684928ebd3d33": "CE 1.1.7", 
        "c8dd0fd8fa3faa9b9f0dd767b5a2c995": "CE 1.9.1.1", 
        "a4296235ba7ad200dd042fa5200c11b0": "CE 1.6.0.0", 
        "d1bfb9f8d4c83e4a6a826d2356a97fd7": "CE 1.3.1.1", 
        "4422dffc16da547c671b086938656397": "CE 1.4.2.0", 
        "0e400488c83e63110da75534f49f23f3": "CE 1.3.2.1"
    }, 
    "js/mage/adminhtml/product.js": {
        "e887acfc2f7af09e04f8e99ac6f7180d": "CE 1.3.0"
    }, 
    "skin/frontend/rwd/default/css/styles.css": {
        "bf6c8e2ba2fc5162dd5187b39626a3a0": "CE 1.9.0.1", 
        "8a874fcb6cdcb82947ee4dbbe1822f3e": "CE 1.9.0.0"
    }, 
    "js/prototype/validation.js": {
        "295494d0966637bdd03e4ec17c2f338c": "CE 1.4.1.0", 
        "d3252becf15108532d21d45dced96d53": "CE 1.4.1.1"
    }, 
    "js/mage/adminhtml/tools.js": {
        "ea81bcf8d9b8fcddb27fb9ec7f801172": "CE 1.3.2.2", 
        "86bbebe2745581cd8f613ceb5ef82269": "CE 1.7.0.1", 
        "d594237950932b9a3948288a020df1ba": "CE 1.3.2.4"
    }, 
    "js/lib/flex.js": {
        "4040182326f3836f98acabfe1d507960": "CE 1.4.0.1", 
        "eb84fc6c93a9d27823dde31946be8767": "CE 1.4.0.0"
    }
}

It's not perfect, as some (minor) versions don't have a unique file+hash combination under js/skin/media, but for the majority it works.

It's a clever idea but there are some potential "gotchas" to keep in mind:

  • Some sites might use Windows line endings while others use Linux endings, so you'd end up with a different MD5 hash for each type. You'd therefore need two hashes for each file.
  • EE will either have the same hashes or completely different ones. If the former, you'll need some way to differentiate between CE and EE. If the latter, you'll want to include those hashes too.

Lastly, this suffers from the same issue as the current detection mechanism: it doesn't work for minified files.

Nevertheless, I kinda like this idea and look forward to seeing what @steverobbins thinks.

I really like this idea.

@colinodell brings up good points. Some testing in the wild is needed.

This could be incorporated in addition to the existing detection. Yours would run first since it has finite versions, but if no match is found it could move on to the next check.

I'd also like to add in parsing the version from /downloader url if available, or checking if LICENSE_EE.txt exists.

This has been added to dev and is present on the web page: http://magescan.project.steverobbins.name/?url=demo.magentocommerce.com

I'm going to monitor results for a while and make sure this is working as expected.

@steverobbins Wow, that's a very quick implementation!

Re gotchas:

  • Newline variations: either store multiple md5sums as @colinodell said or normalize at runtime before hashing (s/\r//)
  • EE: would you know where I could download all EE versions?
  • Another gotcha: I would suspect that many have modified/removed skin/frontend/rwd/default/css/styles.css but testing will show.

Oops, there was an error in my analysis logic. See here for the updated code: https://github.com/gwillem/magento-version-identification/blob/master/find_unique_checksums.py

The new hashes are:

{
    "skin/adminhtml/default/default/boxes.css": {
        "84b67457247969a206456565111c456b": "CE 1.1.2, CE 1.1.3, CE 1.1.4", 
        "d0511b190cdddf865cca7873917f9a69": "CE 1.1.1", 
        "a2c7f9ddda846ba76220d7bcbe85c985": "CE 1.2.1, CE 1.2.1.1, CE 1.2.1.2", 
        "1cbeca223c2e15dcaf500caa5d05b4ed": "CE 1.7.0.0"
    }, 
    "js/varien/product.js": {
        "6af30941970891608b0be568896946db": "CE 1.2.0, CE 1.2.0.1, CE 1.2.0.2, CE 1.2.0.3"
    }, 
    "js/mage/adminhtml/sales.js": {
        "17da0470950e8dd4b30ccb787b1605f5": "CE 1.1.5, CE 1.1.6", 
        "48d609bb2958b93d7254c13957b704c4": "CE 1.6.1.0, CE 1.6.2.0", 
        "a86ad3ba7ab64bf9b3d7d2b9861d93dc": "CE 1.0", 
        "a0436f1eee62dded68e0ec860baeb699": "CE 1.9.1.0", 
        "d80c40eeef3ca62eb4243443fe41705e": "CE 1.5.0.1", 
        "5656a8c1c646afaaf260a130fe405691": "CE 1.8.1.0", 
        "95e730c4316669f2df71031d5439df21": "CE 1.1.0", 
        "bdacf81a3cf7121d7a20eaa266a684ec": "CE 1.5.1.0", 
        "c8dd0fd8fa3faa9b9f0dd767b5a2c995": "CE 1.9.1.1", 
        "5112f328e291234a943684928ebd3d33": "CE 1.1.7, CE 1.1.8", 
        "26c8fd113b4e51aeffe200ce7880b67a": "CE 1.8.0.0", 
        "a4296235ba7ad200dd042fa5200c11b0": "CE 1.6.0.0", 
        "839ead52e82a2041f937389445b8db04": "CE 1.3.3.0", 
        "d1bfb9f8d4c83e4a6a826d2356a97fd7": "CE 1.3.1, CE 1.3.1.1", 
        "4422dffc16da547c671b086938656397": "CE 1.4.2.0", 
        "0e400488c83e63110da75534f49f23f3": "CE 1.3.2, CE 1.3.2.1, CE 1.3.2.2, CE 1.3.2.3, CE 1.3.2.4"
    }, 
    "js/mage/adminhtml/product.js": {
        "e887acfc2f7af09e04f8e99ac6f7180d": "CE 1.3.0"
    }, 
    "skin/frontend/rwd/default/css/styles.css": {
        "bf6c8e2ba2fc5162dd5187b39626a3a0": "CE 1.9.0.1", 
        "8a874fcb6cdcb82947ee4dbbe1822f3e": "CE 1.9.0.0"
    }, 
    "js/mage/adminhtml/tools.js": {
        "86bbebe2745581cd8f613ceb5ef82269": "CE 1.7.0.1, CE 1.7.0.2", 
        "ea81bcf8d9b8fcddb27fb9ec7f801172": "CE 1.3.2.2", 
        "d594237950932b9a3948288a020df1ba": "CE 1.3.2.3, CE 1.3.2.4, CE 1.3.3.0"
    }, 
    "js/lib/flex.js": {
        "4040182326f3836f98acabfe1d507960": "CE 1.4.0.1", 
        "eb84fc6c93a9d27823dde31946be8767": "CE 1.4.0.0"
    }, 
    "js/prototype/validation.js": {
        "295494d0966637bdd03e4ec17c2f338c": "CE 1.4.1.0", 
        "d3252becf15108532d21d45dced96d53": "CE 1.4.1.1"
    }
}

Todo: automatically determine composite unique hashes that will identify the remaining versions, such as 1.3.3.0

I've added in some Enterprise versions (both in @gwillem's repo and here).

This has been released (v.1.5)