@article {168, title = {Activity outcome measurement for postacute care}, journal = {Medical Care}, volume = {42}, number = {1 Suppl}, year = {2004}, note = {ISSN 0025-7079; Journal Article; Multicenter Study}, pages = {I49-I61}, abstract = {BACKGROUND: Efforts to evaluate the effectiveness of a broad range of postacute care services have been hindered by the lack of conceptually sound and comprehensive measures of outcomes. It is critical to determine a common underlying structure before employing current methods of item equating across outcome instruments for future item banking and computer-adaptive testing applications. OBJECTIVE: To investigate the factor structure, reliability, and scale properties of items underlying the Activity domains of the International Classification of Functioning, Disability and Health (ICF) for use in postacute care outcome measurement. METHODS: We developed a 41-item Activity Measure for Postacute Care (AM-PAC) that assessed an individual{\textquoteright}s execution of discrete daily tasks in his or her own environment across major content domains as defined by the ICF. We evaluated the reliability and discriminant validity of the prototype AM-PAC in 477 individuals in active rehabilitation programs across 4 rehabilitation settings using factor analyses, tests of item scaling, internal consistency reliability analyses, Rasch item response theory modeling, residual component analysis, and modified parallel analysis. RESULTS: Results from an initial exploratory factor analysis produced 3 distinct, interpretable factors that accounted for 72\% of the variance: Applied Cognition (44\%), Personal Care \& Instrumental Activities (19\%), and Physical \& Movement Activities (9\%); these 3 activity factors were verified by a confirmatory factor analysis. Scaling assumptions were met for each factor in the total sample and across diagnostic groups. Internal consistency reliability was high for the total sample (Cronbach alpha = 0.92 to 0.94), and for specific diagnostic groups (Cronbach alpha = 0.90 to 0.95). Rasch scaling, residual factor, differential item functioning, and modified parallel analyses supported the unidimensionality and goodness of fit of each unique activity domain. CONCLUSIONS: This 3-factor model of the AM-PAC can form the conceptual basis for common-item equating and computer-adaptive applications, leading to a comprehensive system of outcome instruments for postacute care settings.}, keywords = {*Self Efficacy, *Sickness Impact Profile, Activities of Daily Living/*classification/psychology, Adult, Aftercare/*standards/statistics \& numerical data, Aged, Boston, Cognition/physiology, Disability Evaluation, Factor Analysis, Statistical, Female, Human, Male, Middle Aged, Movement/physiology, Outcome Assessment (Health Care)/*methods/statistics \& numerical data, Psychometrics, Questionnaires/standards, Rehabilitation/*standards/statistics \& numerical data, Reproducibility of Results, Sensitivity and Specificity, Support, U.S. Gov{\textquoteright}t, Non-P.H.S., Support, U.S. Gov{\textquoteright}t, P.H.S.}, author = {Haley, S. M. and Coster, W. J. and Andres, P. L. and Ludlow, L. H. and Ni, P. and Bond, T. L. and Sinclair, S. J. and Jette, A.
M.} } @article {381, title = {Pre-equating: a simulation study based on a large scale assessment model}, journal = {Journal of Applied Measurement}, volume = {5}, number = {3}, year = {2004}, note = {ISSN 1529-7713; Journal Article}, pages = {301-18}, abstract = {Although post-equating (PE) has proven to be an acceptable method in the scaling and equating of items and forms, there are times when the turn-around period for equating and converting raw scores to scale scores is so small that PE cannot be undertaken within the prescribed time frame. In such cases, pre-equating (PrE) could be considered as an acceptable alternative. Assessing the feasibility of using item calibrations from the item bank (as in PrE) is conditioned on the equivalency of the calibrations and the errors associated with it vis-{\`a}-vis the results obtained via PE. This paper creates item banks over three periods of item introduction into the banks and uses the Rasch model in examining data with respect to the recovery of item parameters, the measurement error, and the effect cut-points have on examinee placement in both the PrE and PE situations. Results indicate that PrE is a viable solution to PE provided the stability of the item calibrations is enhanced by using large sample sizes (perhaps as large as full-population) in populating the item bank.}, keywords = {*Databases, *Models, Theoretical, Calibration, Human, Psychometrics, Reference Values, Reproducibility of Results}, author = {Taherbhai, H. M. and Young, M. J.} } @article {30, title = {Calibration of an item pool for assessing the burden of headaches: an application of item response theory to the Headache Impact Test (HIT)}, journal = {Quality of Life Research}, volume = {12}, number = {8}, year = {2003}, note = {ISSN 0962-9343; Journal Article}, pages = {913-933}, abstract = {BACKGROUND: Measurement of headache impact is important in clinical trials, case detection, and the clinical monitoring of patients. Computerized adaptive testing (CAT) of headache impact has potential advantages over traditional fixed-length tests in terms of precision, relevance, real-time quality control and flexibility. OBJECTIVE: To develop an item pool that can be used for a computerized adaptive test of headache impact. METHODS: We analyzed responses to four well-known tests of headache impact from a population-based sample of recent headache sufferers (n = 1016). We used confirmatory factor analysis for categorical data and analyses based on item response theory (IRT). RESULTS: In factor analyses, we found very high correlations between the factors hypothesized by the original test constructors, both within and between the original questionnaires. These results suggest that a single score of headache impact is sufficient. We established a pool of 47 items which fitted the generalized partial credit IRT model. By simulating a computerized adaptive health test we showed that an adaptive test of only five items had a very high concordance with the score based on all items and that different worst-case item selection scenarios did not lead to bias.
CONCLUSION: We have established a headache impact item pool that can be used in CAT of headache impact.}, keywords = {*Cost of Illness, *Decision Support Techniques, *Sickness Impact Profile, Adolescent, Adult, Aged, Comparative Study, Disability Evaluation, Factor Analysis, Statistical, Headache/*psychology, Health Surveys, Human, Longitudinal Studies, Middle Aged, Migraine/psychology, Models, Psychological, Psychometrics/*methods, Quality of Life/*psychology, Software, Support, Non-U.S. Gov{\textquoteright}t}, author = {Bjorner, J. B. and Kosinski, M. and Ware, J. E., Jr.} } @article {31, title = {The feasibility of applying item response theory to measures of migraine impact: a re-analysis of three clinical studies}, journal = {Quality of Life Research}, volume = {12}, number = {8}, year = {2003}, note = {ISSN 0962-9343; Journal Article}, pages = {887-902}, abstract = {BACKGROUND: Item response theory (IRT) is a powerful framework for analyzing multi-item scales and is central to the implementation of computerized adaptive testing. OBJECTIVES: To explain the use of IRT to examine measurement properties and to apply IRT to a questionnaire for measuring migraine impact--the Migraine Specific Questionnaire (MSQ). METHODS: Data from three clinical studies that employed the MSQ-version 1 were analyzed by confirmatory factor analysis for categorical data and by IRT modeling. RESULTS: Confirmatory factor analyses showed very high correlations between the factors hypothesized by the original test constructions. Further, high item loadings on one common factor suggest that migraine impact may be adequately assessed by only one score. IRT analyses of the MSQ were feasible and provided several suggestions as to how to improve the items and in particular the response choices. Out of 15 items, 13 showed adequate fit to the IRT model. In general, IRT scores were strongly associated with the scores proposed by the original test developers and with the total item sum score. Analysis of response consistency showed that more than 90\% of the patients answered consistently according to a unidimensional IRT model. For the remaining patients, scores on the dimension of emotional function were less strongly related to the overall IRT scores that mainly reflected role limitations. Such response patterns can be detected easily using response consistency indices. Analysis of test precision across score levels revealed that the MSQ was most precise at one standard deviation worse than the mean impact level for migraine patients that are not in treatment. Thus, gains in test precision can be achieved by developing items aimed at less severe levels of migraine impact. CONCLUSIONS: IRT proved useful for analyzing the MSQ. The approach warrants further testing in a more comprehensive item pool for headache impact that would enable computerized adaptive testing.}, keywords = {*Sickness Impact Profile, Adolescent, Adult, Aged, Comparative Study, Cost of Illness, Factor Analysis, Statistical, Feasibility Studies, Female, Human, Male, Middle Aged, Migraine/*psychology, Models, Psychological, Psychometrics/instrumentation/*methods, Quality of Life/*psychology, Questionnaires, Support, Non-U.S. Gov{\textquoteright}t}, author = {Bjorner, J. B. and Kosinski, M. and Ware, J.
E., Jr.} } @article {36, title = {An examination of the comparative reliability, validity, and accuracy of performance ratings made using computerized adaptive rating scales}, journal = {Journal of Applied Psychology}, volume = {86}, number = {5}, year = {2001}, note = {21480345; ISSN 0021-9010; Journal Article; Validation Studies}, pages = {965-973}, abstract = {This laboratory research compared the reliability, validity, and accuracy of a computerized adaptive rating scale (CARS) format and 2 relatively common and representative rating formats. The CARS is a paired-comparison rating task that uses adaptive testing principles to present pairs of scaled behavioral statements to the rater to iteratively estimate a ratee{\textquoteright}s effectiveness on 3 dimensions of contextual performance. Videotaped vignettes of 6 office workers were prepared, depicting prescripted levels of contextual performance, and 112 subjects rated these vignettes using the CARS format and one or the other competing format. Results showed 23\%-37\% lower standard errors of measurement for the CARS format. In addition, validity was significantly higher for the CARS format (d = .18), and Cronbach{\textquoteright}s accuracy coefficients showed significantly higher accuracy, with a median effect size of .08. The discussion focuses on possible reasons for the results.}, keywords = {*Computer Simulation, *Employee Performance Appraisal, *Personnel Selection, Adult, Automatic Data Processing, Female, Human, Male, Reproducibility of Results, Sensitivity and Specificity, Support, U.S. Gov{\textquoteright}t, Non-P.H.S., Task Performance and Analysis, Video Recording}, author = {Borman, W. C. and Buck, D. E. and Hanson, M. A. and Motowidlo, S. J. and Stark, S. and Drasgow, F.} } @article {191, title = {Item response theory and health outcomes measurement in the 21st century}, journal = {Medical Care}, volume = {38}, number = {9 Suppl II}, year = {2000}, note = {20434967; ISSN 0025-7079; Journal Article}, pages = {II28-II42}, abstract = {Item response theory (IRT) has a number of potential advantages over classical test theory in assessing self-reported health outcomes. IRT models yield invariant item and latent trait estimates (within a linear transformation), standard errors conditional on trait level, and trait estimates anchored to item content. IRT also facilitates evaluation of differential item functioning, inclusion of items with different response formats in the same scale, and assessment of person fit and is ideally suited for implementing computer adaptive testing. Finally, IRT methods can be helpful in developing better health outcome measures and in assessing change over time. These issues are reviewed, along with a discussion of some of the methodological and practical challenges in applying IRT methods.}, keywords = {*Models, Statistical, Activities of Daily Living, Data Interpretation, Statistical, Health Services Research/*methods, Health Surveys, Human, Mathematical Computing, Outcome Assessment (Health Care)/*methods, Research Design, Support, Non-U.S. Gov{\textquoteright}t, Support, U.S. Gov{\textquoteright}t, P.H.S., United States}, author = {Hays, R. D. and Morales, L. S. and Reise, S.
P.} } @article {419, title = {The use of Rasch analysis to produce scale-free measurement of functional ability}, journal = {American Journal of Occupational Therapy}, volume = {53}, number = {1}, year = {1999}, note = {99125047; ISSN 0272-9490; Journal Article}, pages = {83-90}, abstract = {Innovative applications of Rasch analysis can lead to solutions for traditional measurement problems and can produce new assessment applications in occupational therapy and health care practice. First, Rasch analysis is a mechanism that translates scores across similar functional ability assessments, thus enabling the comparison of functional ability outcomes measured by different instruments. This will allow for the meaningful tracking of functional ability outcomes across the continuum of care. Second, once the item-difficulty order of an instrument or item bank is established by Rasch analysis, computerized adaptive testing can be used to target items to the patient{\textquoteright}s ability level, reducing assessment length by as much as one half. More importantly, Rasch analysis can provide the foundation for "equiprecise" measurement or the potential to have precise measurement across all levels of functional ability. The use of Rasch analysis to create scale-free measurement of functional ability demonstrates how this methodology can be used in practical applications of clinical and outcome assessment.}, keywords = {*Activities of Daily Living, Disabled Persons/*classification, Human, Occupational Therapy/*methods, Predictive Value of Tests, Questionnaires/standards, Sensitivity and Specificity}, author = {Velozo, C. A. and Kielhofner, G. and Lai, J-S.} } @article {177, title = {The effect of item pool restriction on the precision of ability measurement for a Rasch-based CAT: comparisons to traditional fixed length examinations}, journal = {Journal of Outcome Measurement}, volume = {2}, number = {2}, year = {1998}, note = {98326380; ISSN 1090-655X; Journal Article}, pages = {97-122}, abstract = {This paper describes a method for examining the precision of a computerized adaptive test with a limited item pool. Standard errors of measurement ascertained in the testing of simulees with a CAT using a restricted pool were compared to the results obtained in a live paper-and-pencil achievement testing of 4494 nursing students on four versions of an examination of calculations of drug administration. CAT measures of precision were considered when the simulated examinee pools were uniform and normal. Precision indices were also considered in terms of the number of CAT items required to reach the precision of the traditional tests. Results suggest that regardless of the size of the item pool, CAT provides greater precision in measurement with a smaller number of items administered even when the choice of items is limited but fails to achieve equiprecision along the entire ability continuum.}, keywords = {*Decision Making, Computer-Assisted, Comparative Study, Computer Simulation, Education, Nursing, Educational Measurement/*methods, Human, Models, Statistical, Psychometrics/*methods}, author = {Halkitis, P.
N.} } @article {33, title = {A computerized adaptive testing system for speech discrimination measurement: The Speech Sound Pattern Discrimination Test}, journal = {Journal of the Acoustical Society of America}, volume = {101}, number = {4}, year = {1997}, note = {97257556; ISSN 0001-4966; Journal Article}, pages = {2289-2298}, abstract = {A computerized, adaptive test-delivery system for the measurement of speech discrimination, the Speech Sound Pattern Discrimination Test, is described and evaluated. Using a modified discrimination task, the testing system draws on a pool of 130 items spanning a broad range of difficulty to estimate an examinee{\textquoteright}s location along an underlying continuum of speech processing ability, yet does not require the examinee to possess a high level of English language proficiency. The system is driven by a mathematical measurement model which selects only test items which are appropriate in difficulty level for a given examinee, thereby individualizing the testing experience. Test items were administered to a sample of young deaf adults, and the adaptive testing system evaluated in terms of respondents{\textquoteright} sensory and perceptual capabilities, acoustic and phonetic dimensions of speech, and theories of speech perception. Data obtained in this study support the validity, reliability, and efficiency of this test as a measure of speech processing ability.}, keywords = {*Diagnosis, Computer-Assisted, *Speech Discrimination Tests, *Speech Perception, Adolescent, Adult, Audiometry, Pure-Tone, Human, Middle Age, Psychometrics, Reproducibility of Results}, author = {Bochner, J. and Garrison, W. and Palmer, L. and MacKenzie, D. and Braveman, A.} }